You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/03/28 19:45:53 UTC
[opennlp] branch master updated: OPENNLP-565 Support for the MASC format (#364)

This is an automated email from the ASF dual-hosted git repository.

jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 4b69d7c  OPENNLP-565 Support for the MASC format (#364)
4b69d7c is described below

commit 4b69d7c82b5d8cf4a67aca4d061745a95b510ada
Author: Jiri Zamecnik <24...@users.noreply.github.com>
AuthorDate: Mon Mar 28 21:45:48 2022 +0200

    OPENNLP-565 Support for the MASC format (#364)
    
    * Support for the MASC format
---
 .../tools/cmdline/StreamFactoryRegistry.java       |   9 +
 .../opennlp/tools/formats/masc/MascDocument.java   | 444 +++++++++++++++++++++
 .../tools/formats/masc/MascDocumentStream.java     | 235 +++++++++++
 .../tools/formats/masc/MascNamedEntityParser.java  | 101 +++++
 .../formats/masc/MascNamedEntitySampleStream.java  | 101 +++++
 .../masc/MascNamedEntitySampleStreamFactory.java   |  74 ++++
 .../tools/formats/masc/MascPOSSampleStream.java    |  92 +++++
 .../formats/masc/MascPOSSampleStreamFactory.java   |  76 ++++
 .../tools/formats/masc/MascPennTagParser.java      | 112 ++++++
 .../opennlp/tools/formats/masc/MascSentence.java   | 341 ++++++++++++++++
 .../tools/formats/masc/MascSentenceParser.java     |  64 +++
 .../formats/masc/MascSentenceSampleStream.java     |  94 +++++
 .../masc/MascSentenceSampleStreamFactory.java      |  81 ++++
 .../java/opennlp/tools/formats/masc/MascToken.java |  83 ++++
 .../tools/formats/masc/MascTokenSampleStream.java  | 112 ++++++
 .../formats/masc/MascTokenSampleStreamFactory.java |  82 ++++
 .../java/opennlp/tools/formats/masc/MascWord.java  |  42 ++
 .../opennlp/tools/formats/masc/MascWordParser.java |  63 +++
 .../masc/MascNamedEntitySampleStreamTest.java      | 167 ++++++++
 .../formats/masc/MascPOSSampleStreamTest.java      | 155 +++++++
 .../formats/masc/MascSentenceSampleStreamTest.java | 165 ++++++++
 .../formats/masc/MascTokenSampleStreamTest.java    | 175 ++++++++
 .../opennlp/tools/formats/masc/fakeMASC-ne.xml     |  20 +
 .../opennlp/tools/formats/masc/fakeMASC-penn.xml   | 145 +++++++
 .../opennlp/tools/formats/masc/fakeMASC-s.xml      |  29 ++
 .../opennlp/tools/formats/masc/fakeMASC-seg.xml    |  20 +
 .../opennlp/tools/formats/masc/fakeMASC.hdr        |  37 ++
 .../opennlp/tools/formats/masc/fakeMASC.txt        |   1 +
 28 files changed, 3120 insertions(+)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index db95a4f..215c80b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -59,6 +59,10 @@ import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFa
 import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
 import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
+import opennlp.tools.formats.masc.MascNamedEntitySampleStreamFactory;
+import opennlp.tools.formats.masc.MascPOSSampleStreamFactory;
+import opennlp.tools.formats.masc.MascSentenceSampleStreamFactory;
+import opennlp.tools.formats.masc.MascTokenSampleStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
 import opennlp.tools.formats.nkjp.NKJPSentenceSampleStreamFactory;
@@ -130,6 +134,11 @@ public final class StreamFactoryRegistry {
     IrishSentenceBankTokenSampleStreamFactory.registerFactory();
     LeipzigLanguageSampleStreamFactory.registerFactory();
     NKJPSentenceSampleStreamFactory.registerFactory();
+
+    MascNamedEntitySampleStreamFactory.registerFactory();
+    MascPOSSampleStreamFactory.registerFactory();
+    MascSentenceSampleStreamFactory.registerFactory();
+    MascTokenSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java
new file mode 100644
index 0000000..0e7af1a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocument.java
@@ -0,0 +1,444 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.SAXParser;
+
+import org.xml.sax.SAXException;
+
+import opennlp.tools.util.Span;
+import opennlp.tools.util.XmlUtil;
+
+
+public class MascDocument {
+
+  private final List<MascSentence> sentences;
+  private final String pathToFile;
+  private Iterator<MascSentence> sentenceIterator;
+  private boolean hasPennTags = false;
+  private boolean hasNamedEntities = false;
+
+  /**
+   * Creates a document over the given sentences.
+   *
+   * @param path      The path of the document header; within this class it is only used in
+   *                  diagnostic messages.
+   * @param sentences The sentences handed out, in list order, by {@link #read()}.
+   */
+  public MascDocument(String path, List<MascSentence> sentences) {
+    this.sentences = sentences;
+    this.pathToFile = path;
+    this.sentenceIterator = this.sentences.iterator();
+  }
+
+  /**
+   * Creates a MASC document with all of the stand-off annotations translated into the internal
+   * structure. Every stream that was passed in is closed before this method returns, whether
+   * the parsing succeeded or not.
+   *
+   * @param path      The path where the document header is.
+   * @param f_primary The file with the raw corpus text.
+   * @param f_seg     The file with segmentation into quarks.
+   * @param f_penn    The file with tokenization and Penn POS tags produced
+   *                  by GATE-5.0 ANNIE application, or null if there is none.
+   * @param f_s       The file with sentence boundaries.
+   * @param f_ne      The file with named entities, or null if there is none.
+   * @return A document containing the text and its annotations. Immutability is not guaranteed yet.
+   * @throws IOException if a required stream is missing, the raw data cannot be read or the
+   *                     alignment of the raw data with annotations fails
+   */
+  public static MascDocument parseDocument(String path, InputStream f_primary, InputStream f_seg,
+                                           InputStream f_penn, InputStream f_s, InputStream f_ne)
+      throws IOException {
+
+    try {
+      // Fail with a checked exception instead of a NullPointerException further down.
+      if (f_primary == null || f_seg == null || f_s == null) {
+        throw new IOException("The raw text, the segmentation and the sentence annotation " +
+            "are required to parse the document: " + path);
+      }
+
+      String text = readText(f_primary);
+      List<MascWord> words = parseWords(f_seg);
+      List<Span> sentenceSpans = parseSentences(f_s);
+
+      List<MascSentence> sentences = combineAnnotations(text, sentenceSpans, words);
+      MascDocument doc = new MascDocument(path, sentences);
+
+      // if the file has Penn POS tags, add them
+      if (f_penn != null) {
+        doc.addPennTags(parsePennTags(f_penn));
+      }
+
+      if (f_ne != null) {
+        doc.addNamedEntityTags(parseNamedEntity(f_ne));
+      }
+
+      //todo: make the annotations immutable
+      //todo: should we cleanup the document (e.g. remove sentences without tokens?)
+      return doc;
+    } finally {
+      // The helpers above close the stream they consume. This releases the streams which were
+      // never reached because an earlier step failed; closing a stream twice has no effect.
+      for (InputStream stream : new InputStream[] {f_primary, f_seg, f_s, f_penn, f_ne}) {
+        if (stream != null) {
+          try {
+            stream.close();
+          } catch (IOException ignored) {
+            // best-effort cleanup, must not replace the result or the exception of the parsing
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Reads the whole corpus file as UTF-8 text and closes the stream.
+   *
+   * @param stream The corpus file
+   * @return The text of the file
+   * @throws IOException if the stream cannot be read or closed
+   */
+  private static String readText(InputStream stream) throws IOException {
+    // Closing the reader also closes the wrapped stream. With try-with-resources a read
+    // failure stays the primary exception instead of being replaced by a failing close().
+    try (Reader reader = new BufferedReader(
+        new InputStreamReader(stream, StandardCharsets.UTF_8))) {
+      StringBuilder contents = new StringBuilder();
+      char[] buffer = new char[8192];
+      int read;
+      // read() returns -1 once the end of the stream has been reached
+      while ((read = reader.read(buffer, 0, buffer.length)) != -1) {
+        contents.append(buffer, 0, read);
+      }
+      return contents.toString();
+    }
+  }
+
+
+  /**
+   * Parses the word segmentation stand-off annotation and closes the stream.
+   *
+   * @param f_seg The file with segmentation
+   * @return An unmodifiable list of individual quarks, expressed as MascWord-s
+   * @throws IOException if the file cannot be read, parsed or closed
+   */
+  private static List<MascWord> parseWords(InputStream f_seg) throws IOException {
+
+    try {
+      SAXParser saxParser = XmlUtil.createSaxParser();
+      MascWordParser handler = new MascWordParser();
+      try {
+        saxParser.parse(f_seg, handler);
+      } catch (SAXException e) {
+        // keep the diagnosis of the parser reachable through getCause()
+        throw new IOException("Could not parse the region annotation file", e);
+      }
+
+      return Collections.unmodifiableList(handler.getAnchors());
+
+    } finally {
+      f_seg.close();
+    }
+  }
+
+  /**
+   * Parse the sentence annotation file, drop the sentences which overlap with their successor
+   * and close the stream.
+   *
+   * @param f_s the sentence annotation file
+   * @return an unmodifiable list of Spans delimiting each sentence; empty if the file does not
+   * contain any sentence
+   * @throws IOException if the sentence file cannot be parsed or closed
+   */
+  private static List<Span> parseSentences(InputStream f_s) throws IOException {
+
+    try {
+      SAXParser saxParser = XmlUtil.createSaxParser();
+      MascSentenceParser handler = new MascSentenceParser();
+      try {
+        saxParser.parse(f_s, handler);
+      } catch (SAXException e) {
+        // keep the diagnosis of the parser reachable through getCause()
+        throw new IOException("Could not parse the sentence annotation file", e);
+      }
+
+      List<Span> anchors = handler.getAnchors();
+      if (anchors.isEmpty()) {
+        // Nothing to filter; the unconditional look-up of the last anchor below would fail.
+        return Collections.emptyList();
+      }
+
+      /*Filter out sentence overlaps.
+      Keep only those sentences  where sentence.end < nextsentence.beginning
+      avoid deleting in the middle and repeatedly shifting the list by copying into a new list*/
+      //todo: can we know a priori, if we need this filtering?
+      List<Span> filteredAnchors = new ArrayList<>();
+      for (int i = 0; i < anchors.size() - 1; i++) {
+        if (anchors.get(i).getEnd() < anchors.get(i + 1).getStart()) {
+          filteredAnchors.add(anchors.get(i));
+        }
+      }
+      // the last sentence has no successor it could overlap with, so it is always kept
+      filteredAnchors.add(anchors.get(anchors.size() - 1));
+
+      return Collections.unmodifiableList(filteredAnchors);
+
+    } finally {
+      f_s.close();
+    }
+
+  }
+
+  /**
+   * Parses the Penn-POS (GATE5-ANNIE) stand-off annotation and closes the stream.
+   *
+   * @param f_penn The file with Penn POS tags
+   * @return A map of three sub-maps: tokenToTag, from Penn token ID (int) to Penn POS-tag,
+   * tokenToBase, from Penn token ID (int) to the base and tokenToQuarks, from Penn token ID
+   * (int) to the quark IDs contained in that token.
+   * @throws IOException if the file cannot be read, parsed or closed
+   */
+  private static Map<String, Map> parsePennTags(InputStream f_penn) throws IOException {
+    Map<String, Map> tagsAndBases = new HashMap<>();
+
+    try {
+      SAXParser saxParser = XmlUtil.createSaxParser();
+      MascPennTagParser handler = new MascPennTagParser();
+      try {
+        saxParser.parse(f_penn, handler);
+      } catch (SAXException e) {
+        // keep the diagnosis of the parser reachable through getCause()
+        throw new IOException("Could not parse the Penn tag annotation file", e);
+      }
+
+      tagsAndBases.put("tokenToTag", handler.getTags());
+      tagsAndBases.put("tokenToBase", handler.getBases());
+      tagsAndBases.put("tokenToQuarks", handler.getTokenToQuarks());
+
+      return tagsAndBases;
+
+    } finally {
+      f_penn.close();
+    }
+  }
+
+  /**
+   * Parses the named entity stand-off annotation and closes the stream.
+   *
+   * @param f_ne The file with named entity annotations
+   * @return A map with two sub-maps, entityIDtoEntityType, mapping entity ID integers
+   * to entity type Strings, and entityIDsToTokens, mapping entity ID integers to lists of Penn
+   * token ID integers
+   * @throws IOException if the file cannot be read, parsed or closed
+   */
+  private static Map<String, Map> parseNamedEntity(InputStream f_ne) throws IOException {
+
+    try {
+      SAXParser saxParser = XmlUtil.createSaxParser();
+      MascNamedEntityParser handler = new MascNamedEntityParser();
+      try {
+        saxParser.parse(f_ne, handler);
+      } catch (SAXException e) {
+        // The diagnosis of the parser travels with the exception as its cause instead of
+        // being printed to the standard output, like in the other parse helpers.
+        throw new IOException("Could not parse the named entity annotation file", e);
+      }
+
+      Map<String, Map> results = new HashMap<>();
+      results.put("entityIDtoEntityType", handler.getEntityIDtoEntityType());
+      results.put("entityIDsToTokens", handler.getEntityIDsToTokens());
+      return results;
+
+    } finally {
+      f_ne.close();
+    }
+  }
+
+  /**
+   * Combines the raw text with annotations that every file should have.
+   *
+   * @param text          The raw text.
+   * @param sentenceSpans The spans defining individual sentences. Overlaps are not permitted.
+   *                      Spans of length zero are skipped.
+   * @param words         The quarks of the raw text.
+   * @return An unmodifiable list of sentences, each of which is a list of quarks. Some quarks
+   * may belong to more than one sentence. Quarks which do not overlap any sentence are
+   * silently dropped.
+   * @throws IOException If the quarks run out before a sentence, other than the last one,
+   *                     is completed.
+   */
+  private static List<MascSentence> combineAnnotations(String text,
+                                                       List<Span> sentenceSpans,
+                                                       List<MascWord> words) throws IOException {
+
+    int wordIndex = 0;
+    int wordCount = words.size();
+    List<MascSentence> sentences = new ArrayList<>();
+    for (Span s : sentenceSpans) {
+      if (s.getEnd() - s.getStart() > 0) {
+        List<MascWord> quarks = new ArrayList<>();
+        int sentenceStart = s.getStart();
+        int sentenceEnd = s.getEnd();
+
+        //todo: is it okay that quarks can cross sentence boundary? What are the implications?
+        /*
+        Allow quarks to cross sentence boundary.
+        The decisive factor determining if a quark belongs to a sentence is if they overlap.
+        I.e. sent.getEnd() > quark.getStart() && sent.getStart() < quark.getEnd()
+         */
+        if (wordCount > 0) {
+          // The previous sentence may have consumed the last quark, which leaves wordIndex one
+          // past the end of the list; step back so that the look-up below stays in range.
+          wordIndex = Math.min(wordIndex, wordCount - 1);
+          MascWord candidate = words.get(wordIndex);
+          //Find sentence beginning, should not be needed unless overlaps occur
+          while (sentenceStart < candidate.getEnd() && wordIndex > 0) {
+            wordIndex--;
+            candidate = words.get(wordIndex);
+          }
+        }
+
+        //todo: can this be translated into Span's methods .crosses()/.contains()?
+        //find all quarks contained or crossing the span of that sentence
+        boolean sentenceOver = false;
+        while ((!sentenceOver) && wordIndex < wordCount) {
+          MascWord nextWord = words.get(wordIndex);
+          int nextWordStart = nextWord.getStart();
+          int nextWordEnd = nextWord.getEnd();
+          // word either ends or starts or ends & starts in the middle of sentence
+          if (sentenceEnd > nextWordStart && sentenceStart < nextWordEnd) {
+            quarks.add(nextWord);
+            if (sentenceEnd == nextWordEnd) {
+              sentenceOver = true;
+            }
+            wordIndex++;
+          } else if (sentenceEnd <= nextWordStart) {
+            // the quark starts after the sentence: leave it for the next sentence
+            sentenceOver = true;
+          } else {
+            // the quark ends before the sentence starts: it belongs to no sentence, skip it
+            wordIndex++;
+          }
+        }
+
+        // If we are at the end of words, but not in the last sentence, throw an error
+        if (!sentenceOver && sentences.size() != sentenceSpans.size() - 1) {
+          throw new IOException("Sentence ends and word ends do not match. " +
+              "First sentence not completed ends at character: " + sentenceEnd);
+        }
+
+        MascSentence sentence = new MascSentence(sentenceStart, sentenceEnd, text, quarks,
+            words);
+        sentences.add(sentence);
+      }
+    }
+
+    return Collections.unmodifiableList(sentences);
+
+  }
+
+
+  /**
+   * Attach the named entity labels to individual tokens. This is a best-effort operation:
+   * if a sentence throws an IOException, the error is reported on System.err, the remaining
+   * sentences are not processed and the hasNamedEntities flag is not set.
+   *
+   * @param namedEntities A map with two sub-maps, entityIDtoEntityType, mapping entity ID integers
+   *                      to entity type Strings, and entityIDsToTokens, mapping entity ID integers
+   *                      to lists of Penn token ID integers
+   */
+  private void addNamedEntityTags(Map<String, Map> namedEntities) {
+    try {
+      Map<Integer, String> entityIDtoEntityType = namedEntities.get("entityIDtoEntityType");
+      Map<Integer, List<Integer>> entityIDsToTokens = namedEntities.get("entityIDsToTokens");
+
+      for (MascSentence s : sentences) {
+        boolean success = s.addNamedEntities(entityIDtoEntityType, entityIDsToTokens);
+        if (!success) {
+          System.out.println("\tIssues occurred in the file: " + pathToFile);
+        }
+      }
+      hasNamedEntities = true;
+    } catch (IOException e) {
+      System.err.println("[ERROR] Failed connecting tokens and named entities.");
+      System.err.println("\tThe error occurred in the file: " + pathToFile);
+      System.err.println(e.getMessage());
+      System.err.println(Arrays.toString(e.getStackTrace()));
+    }
+  }
+
+
+  /**
+   * Attach tags and bases to MascWords in each of the sentences.
+   *
+   * @param tagMaps A map of three sub-maps: tokenToTag, from Penn token ID (int) to Penn POS-tag,
+   *                tokenToBase, from Penn token ID (int) to the base and tokenToQuarks, from
+   *                Penn token ID (int) to the quark IDs contained in that token.
+   * @throws IOException if the tags cannot be attached; the original failure is the cause
+   */
+  private void addPennTags(Map<String, Map> tagMaps) throws IOException {
+    try {
+      // Extract individual mappings
+      Map<Integer, String> tokenToTag = tagMaps.get("tokenToTag");
+      Map<Integer, String> tokenToBase = tagMaps.get("tokenToBase");
+      Map<Integer, int[]> tokenToQuarks = tagMaps.get("tokenToQuarks");
+
+      //Check that all tokens have at least one quark.
+      for (Map.Entry<Integer, int[]> token : tokenToQuarks.entrySet()) {
+        if (token.getValue().length == 0) {
+          System.err.println("[ERROR] Token without quarks: " + token.getKey());
+        }
+      }
+
+      // invert the mapping: which token(s) does each quark belong to
+      Map<Integer, int[]> quarkToTokens = new HashMap<>();
+      for (Map.Entry<Integer, int[]> tokenAndQuarks : tokenToQuarks.entrySet()) {
+        int token = tokenAndQuarks.getKey();
+        int[] quarks = tokenAndQuarks.getValue();
+        for (int quark : quarks) {
+          //very rarely, one quark may belong to several token
+          //this is probably a mistake in the corpus annotation
+          if (quarkToTokens.containsKey(quark)) {
+            // prepend the new token to the tokens already known for this quark
+            int[] tokens = quarkToTokens.get(quark);
+            int[] newTokens = new int[tokens.length + 1];
+            newTokens[0] = token;
+            System.arraycopy(tokens, 0, newTokens, 1, tokens.length);
+            System.out.println("[WARNING] One quark belongs to several tokens. f-seg ID: " +
+                quark);
+            System.out.println("\tThe error occurred in file: " + pathToFile);
+            quarkToTokens.put(quark, newTokens);
+          } else {
+            quarkToTokens.put(quark, new int[] {token});
+          }
+        }
+      }
+
+      for (MascSentence s : sentences) {
+        boolean success = s.tokenizePenn(tokenToQuarks, quarkToTokens, tokenToBase, tokenToTag);
+        if (!success) {
+          System.out.println("\tIssue occurred in file: " + pathToFile);
+        }
+      }
+
+      hasPennTags = true;
+
+    } catch (Exception e) {
+      // The stack trace stays available through the cause instead of being flattened
+      // into the message.
+      throw new IOException("Could not attach POS tags to words. " + e.getMessage(), e);
+    }
+  }
+
+
+  /**
+   * Check whether there is Penn tagging produced by GATE-5.0 ANNIE.
+   * The flag is only set once the Penn tags were attached to the sentences without an exception.
+   *
+   * @return true if this file has aligned tags/tokens
+   */
+  public boolean hasPennTags() {
+    return hasPennTags;
+  }
+
+  /**
+   * Check whether named entity labels were attached to the sentences of this document.
+   *
+   * @return true if the named entity annotation was attached without an exception
+   */
+  public boolean hasNamedEntities() {
+    return this.hasNamedEntities;
+  }
+
+  /**
+   * Hands out the sentences of the document one by one.
+   *
+   * @return The sentence following the one returned by the previous call, or null once all
+   * sentences of the document have been returned.
+   */
+  public MascSentence read() {
+    return sentenceIterator.hasNext() ? sentenceIterator.next() : null;
+  }
+
+  /**
+   * Rewinds the document, so that the next call of {@link #read()} returns its first sentence.
+   */
+  public void reset() {
+    sentenceIterator = sentences.iterator();
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java
new file mode 100644
index 0000000..4dffcf4
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascDocumentStream.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import javax.xml.parsers.SAXParser;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.XmlUtil;
+
+public class MascDocumentStream implements ObjectStream<MascDocument> {
+
+  /**
+   * A helper class to parse the header (.hdr) files. It records the "loc" attribute of the
+   * primaryData element and of every annotation element under the value of their "f.id"
+   * attribute. It is a static nested class, because it does not need the enclosing stream.
+   */
+  private static class HeaderHandler extends DefaultHandler {
+    // Created eagerly, so that getPathList() never returns null.
+    private final HashMap<String, String> annotationFiles = new HashMap<>();
+    private String file = null;
+    private String fType = null;
+
+    /**
+     * @return the locations found so far, keyed by the "f.id" of the file; empty if the
+     * header did not declare any file
+     */
+    protected HashMap<String, String> getPathList() {
+      return annotationFiles;
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes attributes)
+        throws SAXException {
+
+      // remember where the file is and which kind of data it holds
+      if (describesFile(qName)) {
+        file = attributes.getValue("loc");
+        fType = attributes.getValue("f.id");
+      }
+
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+
+      // store the file remembered by startElement()
+      if (describesFile(qName)) {
+        annotationFiles.put(fType, file);
+      }
+
+    }
+
+    /** Tells whether an element declares the primary data or an annotation file. */
+    private static boolean describesFile(String qName) {
+      return qName.equalsIgnoreCase("annotation") || qName.equalsIgnoreCase("primaryData");
+    }
+
+  }
+  private List<MascDocument> documents = new LinkedList<>();
+  private Iterator<MascDocument> documentIterator;
+  private SAXParser saxParser;
+
+  /**
+   * Creates a MascDocumentStream to read all the documents from a given directory and,
+   * recursively, from its subdirectories.
+   *
+   * @param mascCorpusDirectory the directory containing all the MASC files
+   * @throws IOException if any stage of the stream creation fails
+   */
+  public MascDocumentStream(File mascCorpusDirectory) throws IOException {
+    // Must delegate with this(...): creating a second instance with "new" would load the
+    // corpus into a throw-away object and leave this stream without documents and iterator.
+    this(mascCorpusDirectory, true, pathname -> true);
+  }
+
+  /**
+   * Creates a MascDocumentStream to read the documents from a given directory.
+   * Works iff all annotation files mentioned in the headers are present.
+   * Documents which cannot be parsed are reported on System.err and left out.
+   *
+   * @param mascCorpusDirectory the directory containing all the MASC files
+   * @param searchRecursive     whether the search should go through subdirectories
+   * @param fileFilter          a custom file filter to filter out some files or
+   *                            null to accept anything
+   * @throws IOException if any stage of the stream creation fails
+   */
+  public MascDocumentStream(File mascCorpusDirectory,
+                            boolean searchRecursive, FileFilter fileFilter) throws IOException {
+
+    saxParser = XmlUtil.createSaxParser();
+
+    if (!mascCorpusDirectory.isDirectory()) {
+      throw new IOException("Input corpus directory must be a directory " +
+          "according to File.isDirectory()!");
+    }
+
+    int failedLoads = 0;
+    Stack<File> directoryStack = new Stack<>();
+    directoryStack.add(mascCorpusDirectory);
+
+    while (!directoryStack.isEmpty()) {
+      File directory = directoryStack.pop();
+      File[] children = directory.listFiles(fileFilter);
+      if (children == null) {
+        // listFiles() returns null if the directory cannot be read
+        throw new IOException("Could not list the directory: " + directory.getAbsolutePath());
+      }
+
+      for (File file : children) {
+        if (file.isFile()) {
+          String hdrFilePath = file.getAbsolutePath();
+
+          // look for the header files
+          if (hdrFilePath.endsWith(".hdr") && !loadDocument(hdrFilePath)) {
+            failedLoads++;
+          }
+
+        } else if (searchRecursive && file.isDirectory()) {
+          directoryStack.push(file);
+        }
+      }
+    }
+
+    System.out.println("Documents loaded: " + documents.size());
+    if (failedLoads > 0) {
+      System.err.println("Failed loading " + failedLoads + " documents.");
+    }
+    reset();
+
+  }
+
+  /**
+   * Parses the document described by one header and appends it to the loaded documents.
+   * All the files opened for the document are closed before this method returns.
+   *
+   * @param hdrFilePath the absolute path of the header file
+   * @return true if the document was added, false if it could not be parsed
+   * @throws IOException if the header is invalid or one of its files cannot be opened
+   */
+  private boolean loadDocument(String hdrFilePath) throws IOException {
+    HashMap<String, File> fileGroup = checkAnnotations(hdrFilePath);
+
+    // same order as the stream parameters of MascDocument.parseDocument()
+    String[] keys = {"f.text", "f.seg", "f.penn", "f.s", "f.ne"};
+    InputStream[] streams = new InputStream[keys.length];
+    try {
+      for (int i = 0; i < keys.length; i++) {
+        File annotation = fileGroup.get(keys[i]);
+        if (annotation != null) {
+          streams[i] = new BufferedInputStream(new FileInputStream(annotation));
+        }
+      }
+
+      String failure;
+      if (streams[0] == null || streams[1] == null || streams[3] == null) {
+        // the Penn tags and the named entities are optional, the rest is not
+        failure = "The header does not declare the primary data, the segmentation " +
+            "or the sentence annotation.";
+      } else {
+        try {
+          documents.add(MascDocument.parseDocument(hdrFilePath, streams[0], streams[1],
+              streams[2], streams[3], streams[4]));
+          return true;
+        } catch (IOException e) {
+          failure = e.getMessage();
+        }
+      }
+
+      System.err.println("Failed to parse the file: " + hdrFilePath);
+      System.err.println('\t' + failure);
+      return false;
+
+    } finally {
+      // Release whatever was opened; closing a stream which is already closed has no effect.
+      for (InputStream stream : streams) {
+        if (stream != null) {
+          try {
+            stream.close();
+          } catch (IOException ignored) {
+            // best-effort cleanup, must not replace the outcome of the parsing
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Check that all annotation files mentioned in the header are present
+   *
+   * @param path The path to header
+   * @return The files mentioned in the header, keyed by their "f.id"
+   * @throws IOException If the header cannot be parsed or corpus integrity is violated
+   */
+  private HashMap<String, File> checkAnnotations(String path) throws IOException {
+    HeaderHandler handler = new HeaderHandler();
+    HashMap<String, File> fileGroup = new HashMap<>();
+    File hdrFile = new File(path);
+    try {
+      saxParser.parse(hdrFile, handler);
+    } catch (SAXException e) {
+      // keep the diagnosis of the parser reachable through getCause()
+      throw new IOException("Invalid corpus format. " +
+          "Could not parse the header: " + path, e);
+    }
+
+    HashMap<String, String> annotationFiles = handler.getPathList();
+    if (annotationFiles == null) {
+      // the handler did not see any primaryData or annotation element
+      throw new IOException("Invalid corpus format. " +
+          "No file is declared in the header: " + path);
+    }
+
+    String pathToFolder = hdrFile.getParentFile().getAbsolutePath();
+    for (Map.Entry<String, String> annotation : annotationFiles.entrySet()) {
+      String location = annotation.getValue();
+      if (location == null) {
+        // an element without the "loc" attribute; new File(parent, null) would throw
+        throw new IOException("Invalid corpus format. " +
+            "The location of " + annotation.getKey() + " is missing in the header: " + path);
+      }
+
+      File file = new File(pathToFolder, location);
+      // isFile() is false for files which do not exist, no separate exists() needed
+      if (!file.isFile()) {
+        throw new IOException("Corpus integrity violated. " +
+            "Annotation file " + file.getAbsolutePath() + " is missing.");
+      }
+
+      fileGroup.put(annotation.getKey(), file);
+
+    }
+
+    return fileGroup;
+
+  }
+
+  /**
+   * Rewinds every loaded document to its first sentence and
+   * the corpus to its first document.
+   */
+  public void reset() {
+    documents.forEach(MascDocument::reset);
+    documentIterator = documents.iterator();
+  }
+
+  /**
+   * Hands out the loaded documents one by one. Client needs to check if the document has
+   * the necessary annotations.
+   *
+   * @return The next corpus document with all its annotations, or null once all documents
+   * have been returned.
+   * @throws IOException declared for the stream contract.
+   */
+  public MascDocument read() throws IOException {
+    return documentIterator.hasNext() ? documentIterator.next() : null;
+  }
+
+  /**
+   * Remove the corpus from the memory by dropping the references to the loaded documents
+   * and to their iterator. Both fields are null afterwards, so read() and reset() must not
+   * be called on a closed stream.
+   */
+  public void close() {
+    documents = null;
+    documentIterator = null;
+  }
+
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntityParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntityParser.java
new file mode 100644
index 0000000..c1e22de
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntityParser.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * A class to process the MASC Named entity stand-off annotation file.
+ * It reads the label of each named entity from the "a" elements and the tokens each named
+ * entity consists of from the "edge" elements.
+ */
+public class MascNamedEntityParser extends DefaultHandler {
+
+  private final Map<Integer, String> entityIDtoEntityType = new HashMap<>();
+  private final Map<Integer, List<Integer>> entityIDsToTokens = new HashMap<>();
+  // only used to notice tokens which are linked to named entities of different types
+  private final Map<Integer, String> tokenToEntity = new HashMap<>();
+
+  /**
+   * @return the mapping from entity ID to the label of that entity
+   */
+  public Map<Integer, String> getEntityIDtoEntityType() {
+    return entityIDtoEntityType;
+  }
+
+  /**
+   * @return the mapping from entity ID to the Penn token IDs linked to that entity
+   */
+  public Map<Integer, List<Integer>> getEntityIDsToTokens() {
+    return entityIDsToTokens;
+  }
+
+  /**
+   * Records the label carried by an "a" element and the entity-token link carried by an
+   * "edge" element; other elements are ignored.
+   *
+   * @throws SAXException if an entity has several labels or the element cannot be processed
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes attributes)
+      throws SAXException {
+
+    try {
+      if (qName.equals("a")) {
+        int entityID = Integer.parseInt(
+            attributes.getValue("ref").replaceFirst("ne-n", ""));
+        String label = attributes.getValue("label");
+        if (entityIDtoEntityType.containsKey(entityID)) {
+          throw new SAXException("Multiple labels for one named entity");
+        } else {
+          entityIDtoEntityType.put(entityID, label);
+        }
+      }
+
+      if (qName.equals("edge")) {
+        int entityID = Integer.parseInt(
+            attributes.getValue("from").replaceFirst("ne-n", ""));
+        int tokenID = Integer.parseInt(
+            attributes.getValue("to").replaceFirst("penn-n", ""));
+
+        entityIDsToTokens.computeIfAbsent(entityID, id -> new ArrayList<>()).add(tokenID);
+
+/*      Not sure what to do with this. There might be multiple entity links to one token.
+       E.g. Colorado will be one token with the entities "city" and "province".
+       For now, we'll only raise alarm when one TokenID should be assigned
+       to different top-level labels, e.g. person & location (since we are dropping the low-level
+       annotations at the moment). To make this work in OpenNLP (does not allow overlaps), we'll
+       keep only the first named entity type.
+ */
+        //todo: Do we want to give the user control over which types have priority?
+        // NOTE(review): type is null if the label of the entity has not been read yet; the
+        // comparison below then fails and is reported as a parse error by the catch block.
+        String type = entityIDtoEntityType.get(entityID);
+        if (tokenToEntity.containsKey(tokenID) && !type.equals(tokenToEntity.get(tokenID))) {
+          // NOTE(review): the message ends with the literal word "type", not with the type
+          // which is kept - confirm which one was meant before changing the text.
+          System.out.println("[WARNING] One token assigned to different named entity types.\n" +
+              "\tPenn-TokenID: " + tokenID + "\n\tToken types: \"" + type + "\", \"" +
+              tokenToEntity.get(tokenID) + "\"\n\tKeeping only " + "\"type\"");
+        }
+        tokenToEntity.put(tokenID, type);
+      }
+
+    } catch (Exception e) {
+      throw new SAXException("Could not parse the named entity annotation file.\n" +
+          e.getMessage(), e);
+    }
+  }
+
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStream.java
new file mode 100644
index 0000000..dd7c6da
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStream.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.List;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Turns a stream of MascDocuments into NameSamples, one per sentence.
+ */
+public class MascNamedEntitySampleStream extends FilterObjectStream<MascDocument, NameSample> {
+
+  MascDocument buffer;
+
+  /**
+   * Creates a stream of named entity samples, positioned at the first document that has them.
+   *
+   * @param samples a stream of MascDocuments, e.g. a MascDocumentStream
+   * @throws IOException if reading fails or no document carries named entity labels
+   */
+  public MascNamedEntitySampleStream(ObjectStream<MascDocument> samples) throws IOException {
+    super(samples);
+    buffer = nextAnnotatedDocument();
+    if (buffer == null) {
+      throw new IOException("None of the documents has named entity labels");
+    }
+  }
+
+  /** Reads documents until one with named entities is found; null if the stream runs out. */
+  private MascDocument nextAnnotatedDocument() throws IOException {
+    MascDocument document;
+    do {
+      document = samples.read();
+    } while (document != null && !document.hasNamedEntities());
+    return document;
+  }
+
+  /**
+   * Gets the next sample of named entities.
+   *
+   * @return one sentence together with its named entity annotation, or null at end of stream
+   * @throws IOException if the sample cannot be extracted
+   */
+  @Override
+  public NameSample read() throws IOException {
+    try {
+      // A null buffer means the documents ran out, so repeated calls keep returning null.
+      MascSentence sentence = buffer == null ? null : buffer.read();
+      while (sentence == null) {
+        // The current document is over (or absent): move on to the next annotated one.
+        buffer = nextAnnotatedDocument();
+        if (buffer == null) {
+          return null;
+        }
+        sentence = buffer.read();
+      }
+
+      List<String> tokens = sentence.getTokenStrings();
+      List<Span> namedEntities = sentence.getNamedEntities();
+
+      //todo: should the user decide about clearAdaptiveData?
+      return new NameSample(tokens.toArray(new String[0]), namedEntities.toArray(new Span[0]), true);
+    } catch (IOException e) {
+      // Keep the underlying failure as the cause instead of discarding it.
+      throw new IOException("Could not get a sample of named entities from the data.", e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+
+  /** Rewinds the document stream and repositions on the first document with named entities. */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+    buffer = nextAnnotatedDocument();
+  }
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java
new file mode 100644
index 0000000..49bf94b
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.FileFilter;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+public class MascNamedEntitySampleStreamFactory extends AbstractSampleStreamFactory<NameSample> {
+  public static final String MASC_FORMAT = "masc";
+
+  protected <P> MascNamedEntitySampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /** Registers this factory for NameSample streams under the {@link #MASC_FORMAT} name. */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(NameSample.class, MASC_FORMAT,
+        new MascNamedEntitySampleStreamFactory(Parameters.class));
+  }
+
+  /**
+   * Builds a named entity sample stream over the MASC documents selected by the arguments.
+   *
+   * @param args command line arguments, parsed into {@link Parameters}
+   * @return the sample stream, or null if stream creation failed and the error handler returned
+   */
+  @Override
+  public ObjectStream<NameSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    // accepts the files whose name contains the requested substring
+    FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter());
+
+    try {
+      return new MascNamedEntitySampleStream(
+          new MascDocumentStream(params.getData(), params.getRecurrentSearch(), fileFilter));
+    } catch (IOException e) {
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "recurrentSearch",
+        description = "search through files recursively")
+    boolean getRecurrentSearch();
+
+    @ArgumentParser.ParameterDescription(valueName = "fileFilterString",
+        description = "only include files which contain a given string in their name")
+    String getFileFilter();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStream.java
new file mode 100644
index 0000000..7d7b295
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStream.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.List;
+
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+public class MascPOSSampleStream extends FilterObjectStream<MascDocument, POSSample> {
+
+  MascDocument buffer;
+
+  /**
+   * Creates a stream of POS samples, positioned at the first document that has Penn tags.
+   *
+   * @param samples A stream of MascDocuments, e.g. a MascDocumentStream.
+   * @throws IOException if reading fails or no document carries POS tags
+   */
+  public MascPOSSampleStream(ObjectStream<MascDocument> samples) throws IOException {
+    super(samples);
+    buffer = nextTaggedDocument();
+    if (buffer == null) {
+      throw new IOException("None of the documents has POS tags");
+    }
+  }
+
+  /** Reads documents until one with Penn tags is found; null if the stream runs out. */
+  private MascDocument nextTaggedDocument() throws IOException {
+    MascDocument document;
+    do {
+      document = samples.read();
+    } while (document != null && !document.hasPennTags()); // For now, we'll always use Penn tags
+    return document;
+  }
+
+  /**
+   * Gets the next sample.
+   *
+   * @return One sentence together with its POS tags, or null at the end of the stream.
+   * @throws IOException if the sample cannot be extracted.
+   */
+  @Override
+  public POSSample read() throws IOException {
+    try {
+      // A null buffer means the documents ran out, so repeated calls keep returning null.
+      MascSentence sentence = buffer == null ? null : buffer.read();
+      while (sentence == null) {
+        buffer = nextTaggedDocument();
+        if (buffer == null) {
+          return null;
+        }
+        sentence = buffer.read();
+      }
+      List<String> tokens = sentence.getTokenStrings();
+      List<String> POStags = sentence.getTags();
+      return new POSSample(tokens, POStags);
+    } catch (IOException e) {
+      // Keep the underlying failure as the cause instead of discarding it.
+      throw new IOException("Could not get a sample of POS tags from the data.", e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+
+  /** Rewinds the document stream and repositions on the first document with Penn tags. */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+    buffer = nextTaggedDocument();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java
new file mode 100644
index 0000000..c3aa216
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPOSSampleStreamFactory.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.FileFilter;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.ObjectStream;
+
+public class MascPOSSampleStreamFactory extends AbstractSampleStreamFactory<POSSample> {
+  public static final String MASC_FORMAT = "masc";
+
+  protected <P> MascPOSSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /** Registers this factory for POSSample streams under the {@link #MASC_FORMAT} name. */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(POSSample.class, MASC_FORMAT,
+        new MascPOSSampleStreamFactory(Parameters.class));
+  }
+
+  /**
+   * Builds a POS sample stream over the MASC documents selected by the arguments.
+   *
+   * @param args command line arguments, parsed into {@link Parameters}
+   * @return the sample stream, or null if stream creation failed and the error handler returned
+   */
+  @Override
+  public ObjectStream<POSSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    // accepts the files whose name contains the requested substring
+    FileFilter fileFilter = pathname -> pathname.getName().contains(params.getFileFilter());
+
+    try {
+      return new MascPOSSampleStream(
+          new MascDocumentStream(params.getData(), params.getRecurrentSearch(), fileFilter));
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "recurrentSearch",
+        description = "search through files recursively")
+    boolean getRecurrentSearch();
+
+    @ArgumentParser.ParameterDescription(valueName = "fileFilterString",
+        description = "only include files which contain a given string in their name")
+    String getFileFilter();
+  }
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
new file mode 100644
index 0000000..9ca44a0
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Stack;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * A class for parsing MASC's Penn tagging/tokenization stand-off annotation
+ */
+public class MascPennTagParser extends DefaultHandler {
+
+  private final Map<Integer, int[]> tokenToQuarks = new HashMap<>();
+  private final Map<Integer, String> tokenToTag = new HashMap<>();
+  private final Map<Integer, String> tokenToBase = new HashMap<>();
+  // IDs of <node> elements whose <link> has not been seen yet
+  private final Stack<Integer> tokenStack = new Stack<>();
+  // ID referenced by the <a> element that is currently open
+  private final Stack<Integer> tokenStackTag = new Stack<>();
+
+  /** Returns the "msd" feature value (the tag) recorded for each token ID. */
+  public Map<Integer, String> getTags() {
+    return tokenToTag;
+  }
+
+  /** Returns the "base" feature value recorded for each token ID. */
+  public Map<Integer, String> getBases() {
+    return tokenToBase;
+  }
+
+  /** Returns the region (quark) IDs linked to each token ID. */
+  public Map<Integer, int[]> getTokenToQuarks() {
+    return tokenToQuarks;
+  }
+
+  /**
+   * Collects token-to-region links and tag/base features from the annotation elements.
+   *
+   * @throws SAXException if an ID attribute is missing or not numeric,
+   *                      or a link/feature appears without its owning element
+   */
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes attributes)
+      throws SAXException {
+    try {
+      //get the link between region and Penn tag
+      if (qName.equals("node")) {
+        tokenStack.push(Integer.parseInt(attributes.getValue("xml:id").replaceFirst("penn-n", "")));
+      }
+
+      if (qName.equals("link")) {
+        if (tokenStack.isEmpty()) {
+          throw new SAXException("The linking of tokens to quarks is broken.");
+        }
+        String[] targets = attributes.getValue("targets").replaceAll("seg-r", "").split(" ");
+        int[] regions = new int[targets.length];
+        for (int i = 0; i < targets.length; i++) {
+          regions[i] = Integer.parseInt(targets[i]);
+        }
+        tokenToQuarks.put(tokenStack.pop(), regions);
+      }
+
+      if (qName.equals("a")) {
+        tokenStackTag.push(Integer.parseInt(attributes.getValue("ref").replaceFirst("penn-n", "")));
+      }
+
+      if (qName.equals("f")) {
+        if (tokenStackTag.isEmpty()) {
+          throw new SAXException("The linking of tokens to their tags/bases is broken.");
+        }
+        // constant-first comparison: an <f> without a name is neither a tag nor a base
+        String type = attributes.getValue("name");
+        if ("msd".equals(type)) {
+          tokenToTag.put(tokenStackTag.peek(), attributes.getValue("value"));
+        } else if ("base".equals(type)) {
+          tokenToBase.put(tokenStackTag.peek(), attributes.getValue("value"));
+        }
+      }
+    } catch (Exception e) {
+      throw new SAXException("Could not parse the Penn-POS annotation file.\n" + e.getMessage(), e);
+    }
+  }
+
+  /** Forgets the token of an annotation element once that element is closed. */
+  @Override
+  public void endElement(String uri, String localName, String qName) throws SAXException {
+    // guard against an unmatched end tag so it cannot raise EmptyStackException
+    if (qName.equals("a") && !tokenStackTag.isEmpty()) {
+      tokenStackTag.pop();
+    }
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java
new file mode 100644
index 0000000..0ba4092
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentence.java
@@ -0,0 +1,341 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.util.Span;
+
+public class MascSentence extends Span {
+
+  private class QuarkExtractor {
+
+    private final Map<Integer, MascWord> wordsById;
+    private final List<MascWord> allDocumentWords;
+
+    /**
+     * A helper class to extract the extract a quark from the corpus file even if it is beyond the
+     * bounds of the sentence
+     *
+     * @param wordsById        Quarks of the sentence organized by their id
+     * @param allDocumentWords Quarks of the document organized by their id
+     */
+    protected QuarkExtractor(Map<Integer, MascWord> wordsById, List<MascWord> allDocumentWords) {
+      this.wordsById = wordsById;
+      this.allDocumentWords = allDocumentWords;
+    }
+
+    /**
+     * Extract a quark by its key
+     *
+     * @param key The quark's ID
+     * @return The quark reference
+     * @throws IOException if not found in the document
+     */
+    protected MascWord get(int key) throws IOException {
+      //We first check if this word is in the sentence
+      //todo: evaluate the necessity: HashMaps are O(1), right?
+      if (wordsById.containsKey(key)) {
+        return wordsById.get(key);
+      } else {
+        for (MascWord wordFromWholeDocument : allDocumentWords) {
+          if (wordFromWholeDocument.getId() == key) {
+            return wordFromWholeDocument;
+          }
+        }
+      }
+      throw new IOException("Word" + key + " not found in the document.");
+    }
+
+  }
+
+  private final List<MascWord> allDocumentWords;
+  private final String text;
+  private final List<MascWord> words;
+  private final Map<Integer, MascWord> wordsById;
+  private List<MascToken> sentenceTokens = null;
+  private Map<Integer, Integer> tokensById = new HashMap<>();
+  private List<Span> namedEntities = new ArrayList<>();
+
+  /**
+   * Create a MascSentence, containing its associated text and quarks
+   *
+   * @param s              Start of the sentence within the corpus file
+   * @param e              End of the sentence within the corpus file
+   * @param text           The reference to text of the corpus file
+   * @param sentenceQuarks The quarks found in that sentence
+   * @param allQuarks      The reference to a list of all quarks in the file
+   */
+  public MascSentence(int s, int e, String text, List<MascWord> sentenceQuarks,
+                      List<MascWord> allQuarks) {
+    super(s, e);
+    this.text = text;
+    this.words = sentenceQuarks;
+    this.allDocumentWords = allQuarks;
+
+    // We'll create a map of word ID's and the word ref's to speed up the tokenization
+    HashMap<Integer, MascWord> idToWordMap = new HashMap<>();
+    for (MascWord w : sentenceQuarks) {
+      idToWordMap.put(w.getId(), w);
+    }
+    wordsById = idToWordMap;
+  }
+
+  /**
+   * Adds the Penn tokenization and POS tagging to the sentence. Tokens of zero length and
+   * tokens for which no quarks are known are left out.
+   *
+   * @param tokenToQuarks A map from token ID to quarks in that token
+   * @param quarkToTokens A map of quark IDs and the token IDs containing that quark
+   * @param tokenToBase   Token ID to the token base
+   * @param tokenToTag    Token ID to the POS tag
+   * @return true if no issue encountered, false if tokens cross sentence boundaries
+   * @throws IOException if a quark of a token cannot be found in the document
+   */
+  boolean tokenizePenn(Map<Integer, int[]> tokenToQuarks,
+                       Map<Integer, int[]> quarkToTokens,
+                       Map<Integer, String> tokenToBase,
+                       Map<Integer, String> tokenToTag) throws IOException {
+    boolean fileWithoutIssues = true;
+    QuarkExtractor extractor = new QuarkExtractor(wordsById, allDocumentWords);
+    sentenceTokens = new ArrayList<>();
+
+    Set<Integer> tokensProcessed = new HashSet<>();
+    for (MascWord w : words) {
+      //extract the nodes to which this word belongs
+      int[] tokens = quarkToTokens.get(w.getId());
+
+      //Only continue, if the word belongs to at least one node
+      if (tokens == null) {
+        continue;
+      }
+      for (int token : tokens) {
+        //check if we already have the token
+        if (tokensProcessed.contains(token)) {
+          continue;
+        }
+
+        int[] quarksOfToken = tokenToQuarks.get(token); // Get the quark IDs contained in the token
+        if (quarksOfToken == null || quarksOfToken.length == 0) {
+          // nothing to build the token from: report it and move on instead of failing below
+          System.err.println("Token without quarks found: " + token);
+          continue;
+        }
+
+        /*Because there are some quarks which are parts of tokens outside of a sentence
+        We need to check every time if that quark was actually assigned to the sentence
+        If not, we need to extract it manually from the whole document*/
+        MascWord[] quarks = new MascWord[quarksOfToken.length]; //Get the actual quark references
+        for (int i = 0; i < quarks.length; i++) {
+          int quark = quarksOfToken[i];
+          if (!wordsById.containsKey(quark)) {
+            fileWithoutIssues = false;
+            System.out.println("[WARNING] Some tokens cross sentence boundaries." +
+                "\n\tQuark ID: " + quark +
+                "\n\tPenn token ID: " + token);
+          }
+          quarks[i] = extractor.get(quark);
+        }
+
+        int start = quarks[0].getStart();
+        int end = quarks[quarks.length - 1].getEnd();
+
+        //only insert tokens with non-zero length, apparently some of them exist in the corpus
+        if (end - start > 0) {
+          sentenceTokens.add(new MascToken(start, end, token, tokenToTag.get(token),
+              tokenToBase.get(token), quarks));
+          tokensProcessed.add(token);
+        }
+      }
+    }
+    for (int i = 0; i < sentenceTokens.size(); i++) {
+      tokensById.put(sentenceTokens.get(i).getTokenId(), i);
+    }
+
+    sentenceTokens = Collections.unmodifiableList(sentenceTokens);
+    return fileWithoutIssues;
+  }
+
+  /**
+   * Adds the named entity annotation to the tokenized sentence.
+   * <p>
+   * Each entity becomes a half-open span [first, last + 1) over the indices of those of its
+   * tokens that belong to this sentence; entities without any token here are skipped.
+   * Entities must not overlap in OpenNLP, so of two overlapping entities only the longer
+   * one is kept (the earlier one if both are equally long).
+   *
+   * @param entityIDtoEntityType Maps the named entity ID to its type
+   * @param entityIDsToTokens    A list of tokens covered by each named entity
+   * @return true if all went well, false if named entities overlap
+   * @throws IOException if the sentence has not been tokenized yet
+   */
+  boolean addNamedEntities(Map<Integer, String> entityIDtoEntityType,
+                           Map<Integer, List<Integer>> entityIDsToTokens) throws IOException {
+    // tokensById is only filled by tokenizePenn, so tokenization has to come first
+    if (sentenceTokens == null) {
+      throw new IOException("Named entity labels provided for un untokenized sentence.");
+    }
+
+    //for each named entity identify its span
+    for (Map.Entry<Integer, List<Integer>> namedEntity : entityIDsToTokens.entrySet()) {
+
+      int entityID = namedEntity.getKey();
+      String type = entityIDtoEntityType.get(entityID);
+
+      List<Integer> tokenIDs = namedEntity.getValue();
+
+      int start = sentenceTokens.size();
+      int end = 0;
+      boolean entityInThisSentence = false;
+      for (int tokenID : tokenIDs) {
+        Integer tokenIndex = tokensById.get(tokenID);
+        if (tokenIndex == null) {
+          continue; // not one of this sentence's tokens
+        }
+
+        entityInThisSentence = true;
+        if (tokenIndex < start) {
+          start = tokenIndex;
+        }
+        // the end is exclusive, so it has to lie one past the last token of the entity
+        if (tokenIndex + 1 > end) {
+          end = tokenIndex + 1;
+        }
+      }
+
+      if (entityInThisSentence) {
+        namedEntities.add(new Span(start, end, type));
+      }
+    }
+
+    Comparator<Span> compareByStart = Comparator.comparingInt(Span::getStart);
+    namedEntities.sort(compareByStart);
+
+    // Sweep the entities in start order. The ones kept so far never overlap each other, so
+    // a candidate can only collide with the last kept entity, and it does so exactly when
+    // it starts before that entity ends.
+    boolean fileWithoutIssues = true;
+    List<Span> kept = new ArrayList<>();
+    for (Span candidate : namedEntities) {
+      Span last = kept.isEmpty() ? null : kept.get(kept.size() - 1);
+
+      if (last == null || candidate.getStart() >= last.getEnd()) {
+        kept.add(candidate);
+      } else {
+        System.out.println("[WARNING] Named entities overlap. This is forbidden in the OpenNLP." +
+            "\n\tKeeping the longer of them.");
+        if (candidate.length() > last.length()) {
+          kept.set(kept.size() - 1, candidate);
+        }
+        fileWithoutIssues = false;
+      }
+    }
+
+    if (!fileWithoutIssues) {
+      namedEntities = Collections.unmodifiableList(kept);
+    }
+
+    return fileWithoutIssues;
+  }
+
+  /**
+   * Get the named entities
+   *
+   * @return List of named entities defined as token span, e.g. Span(1,3, "org") for tokens [1,3)
+   */
+  public List<Span> getNamedEntities() {
+    return namedEntities;
+  }
+
+  /**
+   * Get the sentence text
+   *
+   * @return Text of the sentence as defined by the sentence segmentation annotation.
+   */
+  public String getSentDetectText() {
+    return text.substring(getStart(), getEnd());
+  }
+
+  /**
+   * Get the text of the sentence tokens
+   *
+   * @return Text of the sentence as defined by the tokens in it.
+   */
+  public String getTokenText() {
+    if (sentenceTokens.isEmpty()) {
+      return "";
+    }
+    return text.substring(sentenceTokens.get(0).getStart(),
+        sentenceTokens.get(sentenceTokens.size() - 1).getEnd());
+  }
+
+  /**
+   * Get the text of the sentence tokens
+   *
+   * @return The texts of the individual tokens in the sentence
+   */
+  public List<String> getTokenStrings() {
+    List<String> tokenArray = new ArrayList<>();
+    for (MascToken t : sentenceTokens) {
+      tokenArray.add(text.substring(t.getStart(), t.getEnd()));
+    }
+
+    return Collections.unmodifiableList(tokenArray);
+
+  }
+
+  /**
+   * Get the boundaries of individual tokens
+   *
+   * @return Spans representing the tokens of the sentence (according to Penn tokenization)
+   */
+  public List<Span> getTokensSpans() {
+
+    List<Span> tokenSpans = new ArrayList<>();
+    int offset = sentenceTokens.isEmpty() ? 0 : sentenceTokens.get(0).getStart();
+
+    for (MascToken i : sentenceTokens) {
+      tokenSpans.add(new Span(i.getStart() - offset, i.getEnd() - offset));
+    }
+
+    return Collections.unmodifiableList(tokenSpans);
+  }
+
+  /**
+   * Gets the POS tags of the tokens in the sentence, in token order.
+   * @return a list with one tag per token
+   * @throws IOException if used on an untokenized sentence
+   */
+  public List<String> getTags() throws IOException {
+    if (sentenceTokens == null) {
+      throw new IOException("POS tags requested for an untokenized sentence.");
+    }
+    List<String> tags = new ArrayList<>(sentenceTokens.size());
+    sentenceTokens.forEach(token -> tags.add(token.getPos()));
+    return tags;
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceParser.java
new file mode 100644
index 0000000..7a679a0
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceParser.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A class to parse the sentence segmentation stand-off annotation
+ */
+class MascSentenceParser extends DefaultHandler {
+
+  private List<Span> sentenceAnchors = null;
+
+  /** Returns the sentence spans read so far, or null if no region has been seen yet. */
+  public List<Span> getAnchors() {
+    return sentenceAnchors;
+  }
+
+  /** Turns the first two anchors of every region element into a sentence span. */
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes attributes)
+      throws SAXException {
+
+    try {
+      // create a sentence and put it into the list of sentences
+      if (qName.equalsIgnoreCase("region")) {
+        String[] anchors = attributes.getValue("anchors").split(" ");
+        int left = Integer.parseInt(anchors[0]);
+        int right = Integer.parseInt(anchors[1]);
+
+        // the list is created lazily, when the first sentence arrives
+        if (sentenceAnchors == null) {
+          sentenceAnchors = new ArrayList<>();
+        }
+        sentenceAnchors.add(new Span(left, right));
+      }
+    } catch (Exception e) {
+      // missing or malformed anchors: keep the original failure as the cause
+      throw new SAXException("Could not parse the sentence annotation file.", e);
+    }
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStream.java
new file mode 100644
index 0000000..7e8a5db
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStream.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/** Groups the sentences of MASC documents into {@link SentenceSample}s of a fixed size. */
+public class MascSentenceSampleStream extends FilterObjectStream<MascDocument, SentenceSample> {
+
+  private final int sentencesPerSample;
+  private MascDocument buffer; // document currently being read; null once all are consumed
+
+  /** Wraps {@code samples} and eagerly reads its first document into the buffer. */
+  public MascSentenceSampleStream(ObjectStream<MascDocument> samples, int sentencesPerSample)
+      throws IOException {
+    super(samples);
+    this.sentencesPerSample = sentencesPerSample;
+    buffer = samples.read();
+  }
+
+  /**
+   * Reads a new sample of sentences
+   *
+   * @return The specified number of sentences, fewer if fewer are left, {@code null} if none
+   * @throws IOException if the underlying document stream cannot be read
+   */
+  @Override
+  public SentenceSample read() throws IOException {
+    if (buffer == null) {
+      return null; // nothing was ever available, or an earlier call consumed everything
+    }
+    try {
+      StringBuilder documentText = new StringBuilder();
+      List<Span> sentenceSpans = new ArrayList<>();
+      for (int i = 0; i < sentencesPerSample; i++) {
+        MascSentence sentence = buffer.read();
+        if (sentence != null) {
+          // Current document still has sentences
+          int startIndex = documentText.length();
+          documentText.append(sentence.getSentDetectText()).append(' ');
+          sentenceSpans.add(new Span(startIndex, documentText.length() - 1));
+        } else if ((buffer = samples.read()) != null) {
+          documentText.append('\n');
+          // Current document exhausted, but we can still move on to the next one
+          i--; // This round does not count
+        } else {
+          // We exhausted all sentences in all documents
+          break;
+        }
+      }
+      if (!sentenceSpans.isEmpty()) { // separators alone do not make a sample
+        documentText.setLength(documentText.length() - 1);
+        return new SentenceSample(documentText,
+            sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+      }
+      return null;
+    } catch (IOException e) {
+      throw new IOException("You are reading an empty document stream. Did you close it?", e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+    buffer = samples.read();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java
new file mode 100644
index 0000000..a445167
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascSentenceSampleStreamFactory.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.FileFilter;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+
+/** Command-line factory creating {@link MascSentenceSampleStream}s for the "masc" format. */
+public class MascSentenceSampleStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  public static final String MASC_FORMAT = "masc";
+
+  protected <P> MascSentenceSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /** Registers this factory for {@link SentenceSample} under {@link #MASC_FORMAT}. */
+  public static void registerFactory() {
+    MascSentenceSampleStreamFactory factory =
+        new MascSentenceSampleStreamFactory(Parameters.class);
+    StreamFactoryRegistry.registerFactory(SentenceSample.class, MASC_FORMAT, factory);
+  }
+
+  /** Parses {@code args} and wraps a {@link MascDocumentStream} in a sentence sample stream. */
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    FileFilter nameContainsFilter = file -> file.getName().contains(params.getFileFilter());
+
+    try {
+      MascDocumentStream documents =
+          new MascDocumentStream(params.getData(), params.getRecurrentSearch(), nameContainsFilter);
+      int sentencesPerSample = Integer.parseInt(params.getSentencesPerSample());
+      return new MascSentenceSampleStream(documents, sentencesPerSample);
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+
+    @ArgumentParser.ParameterDescription(valueName = "recurrentSearch",
+        description = "search through files recursively")
+    boolean getRecurrentSearch();
+
+    @ArgumentParser.ParameterDescription(valueName = "fileFilterString",
+        description = "only include files which contain a given string in their name")
+    String getFileFilter();
+
+  }
+
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java
new file mode 100644
index 0000000..5d17a40
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascToken.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import opennlp.tools.util.Span;
+
+/** A token of a MASC document: a {@link Span} that also holds a POS tag, base form and quarks. */
+public class MascToken extends Span {
+
+  private final String pos;
+  private final String base;
+  private final int tokenId;
+  private final MascWord[] quarks;
+
+  /**
+   * Create a MascToken, which may combine multiple quarks
+   *
+   * @param s      The start of the token in the corpus file
+   * @param e      The end of the token in the corpus file
+   * @param pennId The ID of the token as assigned by the Penn stand-off annotation
+   * @param pos    The POS-tag
+   * @param base   The base form
+   * @param quarks Quarks contained in the token
+   */
+  public MascToken(int s, int e, int pennId, String pos, String base, MascWord[] quarks) {
+    super(s, e);
+    this.pos = pos;
+    this.base = base;
+    this.tokenId = pennId;
+    this.quarks = quarks;
+  }
+
+  /**
+   * Get ID of the token
+   *
+   * @return the ID
+   */
+  public int getTokenId() {
+    return tokenId;
+  }
+
+  /**
+   * Get the base form
+   *
+   * @return the base form
+   */
+  public String getBase() {
+    return base;
+  }
+
+  /**
+   * Get the POS tag
+   *
+   * @return POS tag
+   */
+  public String getPos() {
+    return pos;
+  }
+
+  /**
+   * Get quarks of the token
+   *
+   * @return Array of quark references
+   */
+  public MascWord[] getQuarks() {
+    return quarks;
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStream.java
new file mode 100644
index 0000000..93fd21d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStream.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.IOException;
+import java.util.List;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/** Turns the Penn-tokenized sentences of MASC documents into {@link TokenSample}s. */
+public class MascTokenSampleStream extends FilterObjectStream<MascDocument, TokenSample> {
+
+  MascDocument buffer; // document currently being read; null once the stream is exhausted
+
+  /** Wraps {@code samples} and advances to its first document that has Penn tags. */
+  public MascTokenSampleStream(ObjectStream<MascDocument> samples) throws IOException {
+    super(samples);
+    try {
+      do {
+        buffer = samples.read();
+      } while (buffer != null && !buffer.hasPennTags()); // For now, we only use Penn tokenization
+    } catch (Exception e) {
+      throw new IOException("None of the documents has Penn tokenization" + e.getMessage(), e);
+    }
+    if (buffer == null) {
+      throw new IOException("None of the documents has Penn tokenization");
+    }
+  }
+
+  /**
+   * Reads the next sentence, moving on to the next document with Penn tags when the current
+   * one is over. Sentences with empty text or a zero-length token are reported and skipped.
+   *
+   * @return the next sample, or {@code null} if both document and sentence stream are over
+   * @throws IOException if the sentences cannot be read from the data
+   */
+  @Override
+  public TokenSample read() throws IOException {
+    if (buffer == null) {
+      return null; // an earlier call already exhausted every document
+    }
+    try {
+      boolean sentenceFound;
+      String sentenceString;
+      List<Span> tokensSpans;
+      do {
+        sentenceFound = true; // each candidate is judged afresh, so one bad sentence is not fatal
+        MascSentence sentence = buffer.read();
+        while (sentence == null) {
+          buffer = samples.read();
+          if (buffer == null) {
+            return null;
+          }
+          if (buffer.hasPennTags()) {
+            sentence = buffer.read();
+          }
+        }
+
+        sentenceString = sentence.getTokenText();
+        tokensSpans = sentence.getTokensSpans();
+        if (sentenceString.length() == 0) {
+          System.err.println("[WARNING] Zero sentence found: there is a sentence without any tokens.");
+          System.err.println(sentenceString);
+          System.err.println(tokensSpans.toString());
+          sentenceFound = false;
+        }
+        for (Span t : tokensSpans) {
+          if (t.getEnd() - t.getStart() == 0) {
+            System.err.println("[WARNING] Zero token found: there is a token without any quarks.");
+            System.err.println(sentenceString);
+            System.err.println(tokensSpans.toString());
+            sentenceFound = false;
+          }
+        }
+      } while (!sentenceFound);
+      return new TokenSample(sentenceString, tokensSpans.toArray(new Span[tokensSpans.size()]));
+    } catch (IOException e) {
+      throw new IOException("Could not get a sample of tokens from the data.", e);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    samples.close();
+  }
+
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    samples.reset();
+    do {
+      buffer = samples.read(); // same Penn-tag filter as the constructor
+    } while (buffer != null && !buffer.hasPennTags());
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java
new file mode 100644
index 0000000..655be15
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascTokenSampleStreamFactory.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.FileFilter;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+/** Command-line factory creating {@link MascTokenSampleStream}s for the "masc" format. */
+public class MascTokenSampleStreamFactory extends AbstractSampleStreamFactory<TokenSample> {
+
+  public static final String MASC_FORMAT = "masc";
+
+  protected <P> MascTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /** Registers this factory for {@link TokenSample} under {@link #MASC_FORMAT}. */
+  public static void registerFactory() {
+    MascTokenSampleStreamFactory factory =
+        new MascTokenSampleStreamFactory(Parameters.class);
+    StreamFactoryRegistry.registerFactory(TokenSample.class, MASC_FORMAT, factory);
+  }
+
+  /** Parses {@code args} and wraps a {@link MascDocumentStream} in a token sample stream. */
+  @Override
+  public ObjectStream<TokenSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    FileFilter nameContainsFilter = file -> file.getName().contains(params.getFileFilter());
+
+    try {
+      MascDocumentStream documents =
+          new MascDocumentStream(params.getData(), params.getRecurrentSearch(), nameContainsFilter);
+      return new MascTokenSampleStream(documents);
+    } catch (IOException e) {
+      // That will throw an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+
+  /** Command-line parameters parsed by {@link #create(String[])}. */
+  interface Parameters extends BasicFormatParams {
+    // NOTE(review): create() never reads this value - confirm whether it is still needed
+    @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+
+    @ArgumentParser.ParameterDescription(valueName = "recurrentSearch",
+        description = "search through files recursively")
+    boolean getRecurrentSearch();
+
+    @ArgumentParser.ParameterDescription(valueName = "fileFilterString",
+        description = "only include files which contain a given string in their name")
+    String getFileFilter();
+
+  }
+
+}
+
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java
new file mode 100644
index 0000000..a75dce7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWord.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import opennlp.tools.util.Span;
+
+/** A quark of a MASC document: a {@link Span} together with its stand-off annotation id. */
+public class MascWord extends Span {
+
+  private final int id;
+
+  /**
+   * Saves one of MASC's quarks - basic-level units (may be sub-word)
+   *
+   * @param s  The beginning of the word in the corpus file
+   * @param e  The end of the word in the corpus file
+   * @param id The id as assigned by the stand-off annotation
+   */
+  public MascWord(int s, int e, int id) {
+    super(s, e);
+    this.id = id;
+  }
+
+  public int getId() {
+    return id;
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWordParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWordParser.java
new file mode 100644
index 0000000..db57f82
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascWordParser.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler to parse the word ("quark") segmentation stand-off annotation
+ */
+class MascWordParser extends DefaultHandler {
+
+  private List<MascWord> wordAnchors = null;
+
+  /** Returns the words collected so far; {@code null} if no region element has been seen. */
+  public List<MascWord> getAnchors() {
+    return wordAnchors;
+  }
+
+  /** Builds a word from the "xml:id" and the first two "anchors" values of every region. */
+  @Override
+  public void startElement(String uri, String localName, String qName, Attributes attributes)
+      throws SAXException {
+    try {
+      // create a word and put it into the list of words
+      if (qName.equalsIgnoreCase("region")) {
+        int id = Integer.parseInt(attributes.getValue("xml:id").replaceFirst("seg-r", ""));
+        String[] anchors = attributes.getValue("anchors").split(" ");
+
+        int left = Integer.parseInt(anchors[0]);
+        int right = Integer.parseInt(anchors[1]);
+
+        // the list is created lazily, on the first region
+        if (wordAnchors == null) {
+          wordAnchors = new ArrayList<>();
+        }
+
+        wordAnchors.add(new MascWord(left, right, id));
+      }
+    } catch (Exception e) {
+      throw new SAXException("Could not parse the word segmentation annotation file.", e);
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamTest.java
new file mode 100644
index 0000000..2f11150
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascNamedEntitySampleStreamTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+
+import org.junit.Test;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.TokenNameFinderEvaluator;
+import opennlp.tools.namefind.TokenNameFinderFactory;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/** Tests reading, closing, resetting and training with {@link MascNamedEntitySampleStream}. */
+public class MascNamedEntitySampleStreamTest {
+
+  @Test
+  public void read() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascNamedEntitySampleStream stream;
+      stream = new MascNamedEntitySampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+
+      NameSample s = stream.read();
+
+      String[] expectedTokens = {"This", "is", "a", "test", "Sentence", "."};
+      assertArrayEquals(expectedTokens, s.getSentence());
+
+      Span[] expectedTags = new Span[] {new Span(4, 5, "org")};
+      Span[] returnedTags = s.getNames();
+      // check the start/end positions
+      assertEquals(expectedTags.length, returnedTags.length);
+      for (int i = 0; i < returnedTags.length; i++) {
+        assertTrue(expectedTags[i].equals(returnedTags[i]));
+      }
+
+      s = stream.read();
+      expectedTokens = new String[] {"This", "is", "'nother", "test", "sentence", "."};
+      assertArrayEquals(expectedTokens, s.getSentence());
+
+      expectedTags = new Span[] {};
+      returnedTags = s.getNames();
+      assertArrayEquals(expectedTags, returnedTags);
+
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void close() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascNamedEntitySampleStream stream = new MascNamedEntitySampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+
+      stream.close();
+      // NOTE(review): nothing fails the test when this read succeeds - confirm that is intended
+      stream.read();
+    } catch (IOException e) {
+      assertEquals(
+          "You are reading an empty document stream. " +
+              "Did you close it?", e.getMessage());
+    }
+  }
+
+  @Test
+  public void reset() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascNamedEntitySampleStream stream;
+      stream = new MascNamedEntitySampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+
+      NameSample s = stream.read();
+      s = stream.read();
+      s = stream.read();
+      assertNull(s);  //The stream should be exhausted by now
+
+      stream.reset();
+
+      s = stream.read();
+      String[] expectedTokens = {"This", "is", "a", "test", "Sentence", "."};
+      assertArrayEquals(expectedTokens, s.getSentence());
+
+      Span[] expectedTags = new Span[] {new Span(4, 5, "org")};
+      Span[] returnedTags = s.getNames();
+      // check the start/end positions
+      assertEquals(expectedTags.length, returnedTags.length);
+      for (int i = 0; i < returnedTags.length; i++) {
+        assertTrue(expectedTags[i].equals(returnedTags[i]));
+      }
+
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void train() {
+    try {
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      FileFilter fileFilter = pathname -> pathname.getName().contains("");
+      ObjectStream<NameSample> trainSample = new MascNamedEntitySampleStream(
+          new MascDocumentStream(directory,
+              true, fileFilter));
+
+      System.out.println("Training");
+      TokenNameFinderModel model = null;
+      TrainingParameters trainingParameters = new TrainingParameters();
+      trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 100);
+
+      model = NameFinderME.train("en", null, trainSample,
+          trainingParameters, new TokenNameFinderFactory());
+
+      ObjectStream<NameSample> testNames = new MascNamedEntitySampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+      TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
+      evaluator.evaluate(testNames);
+
+      System.out.println(evaluator.getFMeasure());
+
+    } catch (Exception e) {
+      System.err.println(e.getMessage());
+      StackTraceElement[] traces = e.getStackTrace();
+      for (StackTraceElement trace : traces) {
+        System.err.println(trace.toString());
+      }
+      fail("Exception raised");
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamTest.java
new file mode 100644
index 0000000..4eba1c7
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascPOSSampleStreamTest.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import opennlp.tools.postag.POSEvaluator;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
+
+/** Tests reading, closing, resetting and training with {@link MascPOSSampleStream}. */
+public class MascPOSSampleStreamTest {
+
+  @Test
+  public void read() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascPOSSampleStream stream;
+      stream = new MascPOSSampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+
+      POSSample s = stream.read();
+
+      String[] expectedTokens = {"This", "is", "a", "test", "Sentence", "."};
+      assertArrayEquals(expectedTokens, s.getSentence());
+
+      String[] expectedTags = {"DT", "VB", "AT", "NN", "NN", "."};
+      assertArrayEquals(expectedTags, s.getTags());
+
+      s = stream.read();
+      expectedTokens = new String[] {"This", "is", "'nother", "test", "sentence", "."};
+      assertArrayEquals(expectedTokens, s.getSentence());
+
+      expectedTags = new String[] {"DT", "VB", "RB", "NN", "NN", "."};
+      assertArrayEquals(expectedTags, s.getTags());
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void close() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascPOSSampleStream stream = new MascPOSSampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+
+      stream.close();
+      // NOTE(review): nothing fails the test when this read succeeds - confirm that is intended
+      stream.read();
+    } catch (IOException e) {
+      assertEquals(
+          "You are reading an empty document stream. " +
+              "Did you close it?", e.getMessage());
+    }
+  }
+
+  @Test
+  public void reset() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascPOSSampleStream stream;
+      stream = new MascPOSSampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+
+      POSSample s = stream.read();
+      s = stream.read();
+      s = stream.read();
+      assertNull(s);  //The stream should be exhausted by now
+
+      stream.reset();
+
+      s = stream.read();
+
+      String[] expectedTokens = {"This", "is", "a", "test", "Sentence", "."};
+      assertArrayEquals(expectedTokens, s.getSentence());
+
+      String[] expectedTags = {"DT", "VB", "AT", "NN", "NN", "."};
+      assertArrayEquals(expectedTags, s.getTags());
+
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  @Test
+  public void train() {
+    try {
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      FileFilter fileFilter = pathname -> pathname.getName().contains("");
+      ObjectStream<POSSample> trainPOS = new MascPOSSampleStream(
+          new MascDocumentStream(directory,
+              true, fileFilter));
+
+      System.out.println("Training");
+      POSModel model = null;
+      TrainingParameters trainingParameters = new TrainingParameters();
+      trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 20);
+
+      model = POSTaggerME.train("en", trainPOS,
+          trainingParameters, new POSTaggerFactory());
+
+      ObjectStream<POSSample> testPOS = new MascPOSSampleStream(new MascDocumentStream(directory,
+          true, fileFilter));
+      POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model));
+      evaluator.evaluate(testPOS);
+      System.out.println("Accuracy: " + evaluator.getWordAccuracy());
+      System.out.println("Words: " + evaluator.getWordCount());
+
+    } catch (Exception e) {
+      System.err.println(e.getMessage());
+      System.err.println(Arrays.toString(e.getStackTrace()));
+      fail("Exception raised");
+    }
+
+  }
+
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamTest.java
new file mode 100644
index 0000000..6298273
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascSentenceSampleStreamTest.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+import opennlp.tools.sentdetect.SentenceDetectorEvaluator;
+import opennlp.tools.sentdetect.SentenceDetectorFactory;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests {@link MascSentenceSampleStream} on the fake MASC corpus from the test resources.
+ */
+public class MascSentenceSampleStreamTest {
+
+  /** An exhausted stream must deliver the first sample again after {@code reset()}. */
+  @Test
+  public void reset() {
+    FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+    File directory = new File(this.getClass().getResource(
+        "/opennlp/tools/formats/masc/").getFile());
+    try {
+      MascSentenceSampleStream stream = new MascSentenceSampleStream(
+          new MascDocumentStream(directory, true, fileFilter), 2);
+
+      //exhaust the fake file
+      SentenceSample testSample = stream.read();
+
+      //now we should get null
+      testSample = stream.read();
+      assertNull(testSample);
+
+      //by resetting, we should get good results again
+      stream.reset();
+      testSample = stream.read();
+      assertNotNull(testSample);
+
+      String documentText = "This is a test Sentence. This is 'nother test sentence. ";
+      List<Span> sentenceSpans = new ArrayList<>();
+      sentenceSpans.add(new Span(0, 24));
+      sentenceSpans.add(new Span(25, 55));
+      SentenceSample expectedSample = new SentenceSample(documentText,
+          sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+
+      // JUnit expects (expected, actual); the order matters for the failure message
+      assertEquals(expectedSample.toString(), testSample.toString());
+    } catch (IOException e) {
+      fail("IO Exception");
+    }
+  }
+
+  /**
+   * Reads from a closed stream and checks the message of a resulting {@link IOException}.
+   * NOTE(review): if {@code read()} does not throw after {@code close()}, this test passes
+   * without asserting anything - confirm the intended contract before adding a fail().
+   */
+  @Test
+  public void close() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascSentenceSampleStream stream = new MascSentenceSampleStream(
+          new MascDocumentStream(directory, true, fileFilter), 2);
+      stream.close();
+      stream.read();
+    } catch (IOException e) {
+      assertEquals("You are reading an empty document stream. " +
+          "Did you close it?", e.getMessage());
+    }
+  }
+
+  /** The first read must match the expected two-span sample; the second must return null. */
+  @Test
+  public void read() {
+    FileFilter fileFilter = pathname -> pathname.getName().contains("");
+    File directory = new File(this.getClass().getResource("/opennlp/tools/formats/masc").getFile());
+    try {
+      MascSentenceSampleStream stream = new MascSentenceSampleStream(
+          new MascDocumentStream(directory, true, fileFilter), 2);
+
+      String documentText = "This is a test Sentence. This is 'nother test sentence. ";
+      List<Span> sentenceSpans = new ArrayList<>();
+      sentenceSpans.add(new Span(0, 24));
+      sentenceSpans.add(new Span(25, 55));
+
+      SentenceSample expectedSample = new SentenceSample(documentText,
+          sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+      SentenceSample testSample = stream.read();
+      assertEquals(expectedSample.toString(), testSample.toString());
+
+      //the fake file is exhausted, we should get null now
+      testSample = stream.read();
+      assertNull(testSample);
+    } catch (IOException e) {
+      System.out.println(e.getMessage());
+      System.out.println(Arrays.toString(e.getStackTrace()));
+      fail("IO Exception");
+    }
+  }
+
+  /** Trains and evaluates a sentence detector on the fake corpus; disabled, see the todo. */
+  @Ignore //todo: We can't train on the FakeMasc data, it is too small.
+  @Test
+  public void train() {
+    File directory = new File(this.getClass().getResource(
+        "/opennlp/tools/formats/masc/").getFile());
+    FileFilter fileFilter = pathname -> pathname.getName().contains("");
+    // try-with-resources closes both sample streams even if training or evaluation fails
+    try (ObjectStream<SentenceSample> trainSentences = new MascSentenceSampleStream(
+             new MascDocumentStream(directory, true, fileFilter), 1);
+         ObjectStream<SentenceSample> testSentences = new MascSentenceSampleStream(
+             new MascDocumentStream(directory, true, fileFilter), 1)) {
+      System.out.println("Training");
+      TrainingParameters trainingParameters = new TrainingParameters();
+      trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 20);
+
+      SentenceModel model = SentenceDetectorME.train("en", trainSentences,
+          new SentenceDetectorFactory(), trainingParameters);
+
+      SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(
+          new SentenceDetectorME(model));
+      evaluator.evaluate(testSentences);
+      System.out.println(evaluator.getFMeasure());
+    } catch (Exception e) {
+      System.err.println(e.getMessage());
+      System.err.println(Arrays.toString(e.getStackTrace()));
+      fail("Exception raised: " + e);
+    }
+  }
+
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamTest.java
new file mode 100644
index 0000000..ec2fbe1
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/masc/MascTokenSampleStreamTest.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.masc;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.tokenize.TokenizerEvaluator;
+import opennlp.tools.tokenize.TokenizerFactory;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests {@link MascTokenSampleStream} on the fake MASC corpus from the test resources.
+ */
+public class MascTokenSampleStreamTest {
+
+  /** Reads both sentences and compares their text and token spans with the expected values. */
+  @Test
+  public void read() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascTokenSampleStream stream = new MascTokenSampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+
+      TokenSample s = stream.read();
+
+      String expectedString = "This is a test Sentence.";
+      assertEquals(expectedString, s.getText());
+
+      Span[] expectedTags = {
+          new Span(0, 4),
+          new Span(5, 7),
+          new Span(8, 9),
+          new Span(10, 14),
+          new Span(15, 23),
+          new Span(23, 24)};
+      assertArrayEquals(expectedTags, s.getTokenSpans());
+
+      s = stream.read();
+      String expectedTokens = "This is 'nother test sentence.";
+      assertEquals(expectedTokens, s.getText());
+
+      expectedTags = new Span[] {
+          new Span(0, 4),
+          new Span(5, 7),
+          new Span(8, 15),
+          new Span(16, 20),
+          new Span(21, 29),
+          new Span(29, 30)};
+      assertArrayEquals(expectedTags, s.getTokenSpans());
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  /**
+   * Reads from a closed stream and checks the message of a resulting {@link IOException}.
+   * NOTE(review): if {@code read()} does not throw after {@code close()}, this test passes
+   * without asserting anything - confirm the intended contract before adding a fail().
+   */
+  @Test
+  public void close() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascTokenSampleStream stream = new MascTokenSampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+      stream.close();
+      stream.read();
+    } catch (IOException e) {
+      assertEquals("You are reading an empty document stream. " +
+          "Did you close it?", e.getMessage());
+    }
+  }
+
+  /** Exhausts the stream, resets it and expects the first sentence to be delivered again. */
+  @Test
+  public void reset() {
+    try {
+      FileFilter fileFilter = pathname -> pathname.getName().contains("MASC");
+      File directory = new File(this.getClass().getResource(
+          "/opennlp/tools/formats/masc/").getFile());
+      MascTokenSampleStream stream = new MascTokenSampleStream(
+          new MascDocumentStream(directory, true, fileFilter));
+
+      TokenSample s = stream.read();
+      s = stream.read();
+      s = stream.read();
+      assertNull(s);  //The stream should be exhausted by now
+
+      stream.reset();
+
+      s = stream.read();
+
+      String expectedString = "This is a test Sentence.";
+      assertEquals(expectedString, s.getText());
+
+      Span[] expectedTags = {
+          new Span(0, 4),
+          new Span(5, 7),
+          new Span(8, 9),
+          new Span(10, 14),
+          new Span(15, 23),
+          new Span(23, 24)};
+      assertArrayEquals(expectedTags, s.getTokenSpans());
+    } catch (IOException e) {
+      fail("IO Exception: " + e.getMessage());
+    }
+  }
+
+  /**
+   * Trains a tokenizer on the fake corpus and evaluates it on the same data, printing the
+   * F-measure. Any exception raised by the streams, training or evaluation fails the test.
+   */
+  @Test
+  public void train() {
+    File directory = new File(this.getClass().getResource(
+        "/opennlp/tools/formats/masc/").getFile());
+    FileFilter fileFilter = pathname -> pathname.getName().contains("");
+
+    // try-with-resources closes both sample streams even if training or evaluation fails
+    try (ObjectStream<TokenSample> trainTokens = new MascTokenSampleStream(
+             new MascDocumentStream(directory, true, fileFilter));
+         ObjectStream<TokenSample> testTokens = new MascTokenSampleStream(
+             new MascDocumentStream(directory, true, fileFilter))) {
+      System.out.println("Training");
+      TrainingParameters trainingParameters = new TrainingParameters();
+      trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 20);
+
+      TokenizerModel model = TokenizerME.train(trainTokens,
+          new TokenizerFactory("en", null, false, null), trainingParameters);
+
+      TokenizerEvaluator evaluator = new TokenizerEvaluator(new TokenizerME(model));
+      evaluator.evaluate(testTokens);
+      System.out.println(evaluator.getFMeasure());
+    } catch (Exception e) {
+      System.err.println(e.getMessage());
+      System.err.println(Arrays.toString(e.getStackTrace()));
+      fail("Exception raised: " + e);
+    }
+  }
+
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-ne.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-ne.xml
new file mode 100644
index 0000000..3a6caf3
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-ne.xml
@@ -0,0 +1,20 @@
+<graph xmlns="http://www.xces.org/ns/GrAF/1.0/">
+    <graphHeader>
+        <labelsDecl>
+            <labelUsage label="person" occurs="1"/>
+        </labelsDecl>
+        <dependencies>
+            <dependsOn f.id="f.penn"/>
+        </dependencies>
+        <annotationSpaces>
+            <annotationSpace as.id="anc"/>
+        </annotationSpaces>
+    </graphHeader>
+    <node xml:id="ne-n0"/>
+    <a xml:id="ne-N65579" label="org" ref="ne-n0" as="anc">
+        <fs>
+            <f name="type" value="person"/>
+        </fs>
+    </a>
+    <edge xml:id="ne-lnk1" from="ne-n0" to="penn-n4"/>
+</graph>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-penn.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-penn.xml
new file mode 100644
index 0000000..2be448a
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-penn.xml
@@ -0,0 +1,145 @@
+<graph xmlns="http://www.xces.org/ns/GrAF/1.0/">
+    <graphHeader>
+        <labelsDecl>
+            <labelUsage label="tok" occurs="12"/>
+        </labelsDecl>
+        <dependencies>
+            <dependsOn f.id="f.seg"/>
+        </dependencies>
+        <annotationSpaces>
+            <annotationSpace as.id="anc"/>
+        </annotationSpaces>
+    </graphHeader>
+
+    <node xml:id="penn-n0">
+        <link targets="seg-r0"/>
+    </node>
+    <a xml:id="penn-N65571" label="tok" ref="penn-n0" as="anc">
+        <fs>
+            <f name="base" value="this"/>
+            <f name="msd" value="DT"/>
+            <f name="string" value="This"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n1">
+        <link targets="seg-r2"/>
+    </node>
+    <a xml:id="penn-N65599" label="tok" ref="penn-n1" as="anc">
+        <fs>
+            <f name="base" value="is"/>
+            <f name="msd" value="VB"/>
+            <f name="string" value="is"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n10">
+        <link targets="seg-r20"/>
+    </node>
+    <a xml:id="penn-N65847" label="tok" ref="penn-n10" as="anc">
+        <fs>
+            <f name="base" value="sentence"/>
+            <f name="msd" value="NN"/>
+            <f name="string" value="sentence"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n11">
+        <link targets="seg-r21"/>
+    </node>
+    <a xml:id="penn-N65875" label="tok" ref="penn-n11" as="anc">
+        <fs>
+            <f name="base" value="."/>
+            <f name="msd" value="."/>
+            <f name="string" value="."/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n2">
+        <link targets="seg-r4"/>
+    </node>
+    <a xml:id="penn-N65627" label="tok" ref="penn-n2" as="anc">
+        <fs>
+            <f name="base" value="a"/>
+            <f name="msd" value="AT"/>
+            <f name="string" value="a"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n3">
+        <link targets="seg-r6"/>
+    </node>
+    <a xml:id="penn-N65655" label="tok" ref="penn-n3" as="anc">
+        <fs>
+            <f name="base" value="sample"/>
+            <f name="msd" value="NN"/>
+            <f name="string" value="sample"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n4">
+        <link targets="seg-r8"/>
+    </node>
+    <a xml:id="penn-N65683" label="tok" ref="penn-n4" as="anc">
+        <fs>
+            <f name="base" value="sentence"/>
+            <f name="msd" value="NN"/>
+            <f name="string" value="Sentence"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n5">
+        <link targets="seg-r9"/>
+    </node>
+    <a xml:id="penn-N65711" label="tok" ref="penn-n5" as="anc">
+        <fs>
+            <f name="string" value="."/>
+            <f name="msd" value="."/>
+            <f name="base" value="."/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n6">
+        <link targets="seg-r11"/>
+    </node>
+    <a xml:id="penn-N65739" label="tok" ref="penn-n6" as="anc">
+        <fs>
+            <f name="base" value="this"/>
+            <f name="msd" value="DT"/>
+            <f name="string" value="This"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n7">
+        <link targets="seg-r13"/>
+    </node>
+    <a xml:id="penn-N65767" label="tok" ref="penn-n7" as="anc">
+        <fs>
+            <f name="base" value="is"/>
+            <f name="msd" value="VB"/>
+            <f name="string" value="is"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n8">
+        <link targets="seg-r15 seg-r16"/>
+    </node>
+    <a xml:id="penn-N65795" label="tok" ref="penn-n8" as="anc">
+        <fs>
+            <f name="base" value="'nother"/>
+            <f name="msd" value="RB"/>
+            <f name="string" value="'nother"/>
+        </fs>
+    </a>
+
+    <node xml:id="penn-n9">
+        <link targets="seg-r18"/>
+    </node>
+    <a xml:id="penn-N65819" label="tok" ref="penn-n9" as="anc">
+        <fs>
+            <f name="base" value="sample"/>
+            <f name="msd" value="NN"/>
+            <f name="string" value="sample"/>
+        </fs>
+    </a>
+</graph>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-s.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-s.xml
new file mode 100644
index 0000000..bb124dc
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-s.xml
@@ -0,0 +1,29 @@
+<graph xmlns="http://www.xces.org/ns/GrAF/1.0/">
+    <graphHeader>
+        <labelsDecl>
+            <labelUsage label="s" occurs="2"/>
+        </labelsDecl>
+        <annotationSpaces>
+            <annotationSpace as.id="anc"/>
+        </annotationSpaces>
+    </graphHeader>
+    <region xml:id="s-r0" anchors="1 25"/>
+    <region xml:id="s-r1" anchors="27 57"/>
+
+    <node xml:id="s-n0">
+        <link targets="s-r0"/>
+    </node>
+    <a xml:id="s-N65697" label="s" ref="s-n0" as="anc">
+        <fs>
+            <f name="id" value="s0.1"/>
+        </fs>
+    </a>
+    <node xml:id="s-n1">
+        <link targets="s-r1"/>
+    </node>
+    <a xml:id="s-N65717" label="s" ref="s-n1" as="anc">
+        <fs>
+            <f name="id" value="p1s1"/>
+        </fs>
+    </a>
+</graph>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-seg.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-seg.xml
new file mode 100644
index 0000000..a5f33fa
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC-seg.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<graph xmlns="http://www.xces.org/ns/GrAF/1.0/">
+    <graphHeader>
+        <labelsDecl>
+        </labelsDecl>
+    </graphHeader>
+    <region xml:id="seg-r0" anchors="1 5"/>
+    <region xml:id="seg-r2" anchors="6 8"/>
+    <region xml:id="seg-r4" anchors="9 10"/>
+    <region xml:id="seg-r6" anchors="11 15"/>
+    <region xml:id="seg-r8" anchors="16 24"/>
+    <region xml:id="seg-r9" anchors="24 25"/>
+    <region xml:id="seg-r11" anchors="27 31"/>
+    <region xml:id="seg-r13" anchors="32 34"/>
+    <region xml:id="seg-r15" anchors="35 36"/>
+    <region xml:id="seg-r16" anchors="36 42"/>
+    <region xml:id="seg-r18" anchors="43 47"/>
+    <region xml:id="seg-r20" anchors="48 56"/>
+    <region xml:id="seg-r21" anchors="56 57"/>
+</graph>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.hdr b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.hdr
new file mode 100644
index 0000000..31778d2
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.hdr
@@ -0,0 +1,37 @@
+<documentHeader xmlns="http://www.xces.org/ns/GrAF/1.0/" docId="MASCF-00046" creator="JZ" date.created="2019-08-27"
+                version="1.0.4">
+    <fileDesc>
+        <titleStmt>
+            <title>FakeMASC</title>
+        </titleStmt>
+        <extent count="10" unit="word"/>
+        <sourceDesc>
+            <title>A fake MASC file to test the OpenNLP MASC-Format tools</title>
+            <publisher>None</publisher>
+            <eAddress type="web">http://github.com/</eAddress>
+            <pubPlace>http://github.com/apache/opennlp</pubPlace>
+        </sourceDesc>
+    </fileDesc>
+    <profileDesc>
+        <textClass catRef="SP TR ">
+            <domain>None</domain>
+            <subdomain>None at all</subdomain>
+            <subject>Unit test</subject>
+            <audience>Adult</audience>
+        </textClass>
+        <primaryData f.id="f.text" loc="fakeMASC.txt"/>
+        <annotations>
+            <annotation loc="fakeMASC-s.xml" f.id="f.s">sentence boundaries</annotation>
+            <annotation loc="fakeMASC-seg.xml" f.id="f.seg">word segments: quarks</annotation>
+            <annotation loc="fakeMASC-penn.xml" f.id="f.penn">Penn POS tags</annotation>
+            <annotation loc="fakeMASC-ne.xml" f.id="f.ne">Named entity annotation</annotation>
+        </annotations>
+    </profileDesc>
+    <revisionDesc>
+        <change>
+            <changeDate>2019-08-27</changeDate>
+            <respName>Jiri Zamecnik</respName>
+            <item>Created the fake file to test the standoff markup</item>
+        </change>
+    </revisionDesc>
+</documentHeader>
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.txt
new file mode 100644
index 0000000..0428dc4
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/masc/fakeMASC.txt
@@ -0,0 +1 @@
+ This is a test Sentence.  This is 'nother test sentence.
\ No newline at end of file