You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2016/11/09 21:10:58 UTC

[04/16] opennlp git commit: OPENNLP-622 Refactored to remove usage of main methods of Morfologik.

OPENNLP-622 Refactored to remove usage of main methods of Morfologik.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/1314887f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/1314887f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/1314887f

Branch: refs/heads/trunk
Commit: 1314887fe657f21e1213788fd6084a485781f2f1
Parents: 15c3fb7
Author: William Colen <co...@apache.org>
Authored: Thu Jul 7 05:19:18 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 7 05:19:18 2016 +0000

----------------------------------------------------------------------
 .../builder/MorfologikDictionayBuilder.java     | 144 ++++++-------------
 .../MorfologikDictionaryBuilderParams.java      |  37 +++--
 .../MorfologikDictionaryBuilderTool.java        |  17 +--
 .../lemmatizer/MorfologikLemmatizer.java        |   8 +-
 .../tagdict/MorfologikPOSTaggerFactory.java     |  14 +-
 .../builder/POSDictionayBuilderTest.java        |  67 +++------
 .../lemmatizer/MorfologikLemmatizerTest.java    |  17 +--
 .../tagdict/MorfologikTagDictionaryTest.java    |  18 +--
 .../tagdict/POSTaggerFactoryTest.java           | 108 ++++++++++++++
 src/test/resources/AnnotatedSentences.txt       | 136 ++++++++++++++++++
 src/test/resources/dictionaryWithLemma.info     |  15 ++
 src/test/resources/dictionaryWithLemma.txt      |  21 +--
 12 files changed, 386 insertions(+), 216 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
index 0131318..dbbca4d 100644
--- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
+++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -17,21 +17,15 @@
 
 package opennlp.morfologik.builder;
 
-import java.io.File;
 import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
 import java.util.Properties;
 
 import morfologik.stemming.DictionaryMetadata;
 import morfologik.stemming.EncoderType;
-import morfologik.tools.FSACompile;
-import morfologik.tools.Launcher;
+import morfologik.tools.DictCompile;
 
 /**
  * Utility class to build Morfologik dictionaries from a tab separated values
@@ -41,117 +35,69 @@ import morfologik.tools.Launcher;
 public class MorfologikDictionayBuilder {
 
   /**
-   * Build a Morfologik binary dictionary
-   *
-   * @param dictInFile
-   *          the 3 column TSV dictionary file
-   * @param dictOutFile
-   *          where to store the binary Morfologik dictionary
-   * @param encoding
-   *          the encoding to be used while reading and writing
-   * @param separator
-   *          a field separator, the default is '+'. If your tags contains '+'
-   *          change to something else
-   * @param encoderType
-   *          the Morfologik enconder type
-   * @param isUseInfixes
-   *          if to compact using infixes
+   * Helper to compile a morphological dictionary automaton.
+   * 
+   * @param input
+   *          The input file (base,inflected,tag). An associated metadata
+   *          (*.info) file must exist.
+   * @param overwrite
+   *          Overwrite the output file if it exists.
+   * @param validate
+   *          Validate input to make sure it makes sense.
+   * @param acceptBom
+   *          Accept leading BOM bytes (UTF-8).
+   * @param acceptCr
+   *          Accept CR bytes in input sequences (\r).
+   * @param ignoreEmpty
+   *          Ignore empty lines in the input.
+   * @return the dictionary path
+   * 
    * @throws Exception
    */
-  public void build(File dictInFile, File dictOutFile, Charset encoding,
-      String separator, EncoderType encoderType)
+  public Path build(Path input, boolean overwrite, boolean validate,
+      boolean acceptBom, boolean acceptCr, boolean ignoreEmpty)
       throws Exception {
-    Path propertiesPath = DictionaryMetadata
-        .getExpectedMetadataLocation(dictOutFile.toPath()); 
+
+    DictCompile compiler = new DictCompile(input, overwrite, validate,
+        acceptBom, acceptCr, ignoreEmpty);
+    compiler.call();
+
+    
+    Path metadataPath = DictionaryMetadata
+        .getExpectedMetadataLocation(input);
     
-    this.build(dictInFile, dictOutFile, propertiesPath.toFile(), encoding, separator,
-        encoderType);
+    return metadataPath.resolveSibling(
+        metadataPath.getFileName().toString().replaceAll(
+            "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict"));
   }
 
   /**
-   * Build a Morfologik binary dictionary
-   *
-   * @param dictInFile
-   *          the 3 column TSV dictionary file
-   * @param dictOutFile
-   *          where to store the binary Morfologik dictionary
-   * @param propertiesOutFile
-   *          where to store the properties of the Morfologik dictionary
-   * @param encoding
-   *          the encoding to be used while reading and writing
-   * @param separator
-   *          a field separator, the default is '+'. If your tags contains '+'
-   *          change to something else
-   * @param isUsePrefixes
-   *          if to compact using prefixes
-   * @param isUseInfixes
-   *          if to compact using infixes
+   * Helper to compile a morphological dictionary automaton using default
+   * parameters.
+   * 
+   * @param input
+   *          The input file (base,inflected,tag). An associated metadata
+   *          (*.info) file must exist.
+   *          
+   *  @return the dictionary path
+   * 
    * @throws Exception
    */
-  public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
-      Charset encoding, String separator, EncoderType encoderType) throws Exception {
-
-    // we need to execute tab2morph followed by fsa_build
-
-    File morph = tab2morph(dictInFile, separator, encoderType);
+  public Path build(Path input) throws Exception {
 
-    fsaBuild(morph, dictOutFile);
+    return build(input, true, true, false, false, false);
 
-    morph.delete();
-
-    // now we create the properties files using the passed parameters
-    createProperties(encoding, separator, encoderType,
-        propertiesOutFile);
   }
 
-  void createProperties(Charset encoding, String separator,
-		  EncoderType encoderType, File propertiesFile)
-      throws FileNotFoundException, IOException {
+  Properties createProperties(Charset encoding, String separator,
+      EncoderType encoderType) throws FileNotFoundException, IOException {
 
     Properties properties = new Properties();
     properties.setProperty("fsa.dict.separator", separator);
     properties.setProperty("fsa.dict.encoding", encoding.name());
     properties.setProperty("fsa.dict.encoder", encoderType.name());
 
-    OutputStream os = new FileOutputStream(propertiesFile);
-    properties.store(os, "Morfologik POS Dictionary properties");
-    os.close();
-
-  }
+    return properties;
 
-  private void fsaBuild(File morph, File dictOutFile) throws Exception {
-    String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
-        dictOutFile.getAbsolutePath() };
-    FSACompile.main(params);
-    // FSABuildTool.main(params);
   }
-
-  private File tab2morph(File dictInFile, String separator,
-      EncoderType encoderType) throws Exception {
-
-    // create tab2morph parameters
-    List<String> tag2morphParams = new ArrayList<String>();
-    tag2morphParams.add("tab2morph");
-
-    tag2morphParams.add("--annotation");
-    tag2morphParams.add(separator);
-    
-    tag2morphParams.add("--e");
-    tag2morphParams.add(encoderType.name());
-
-    tag2morphParams.add("-i");
-    tag2morphParams.add(dictInFile.getAbsolutePath());
-
-    // we need a temporary file to store the intermediate output
-    File tmp = File.createTempFile("tab2morph", ".txt");
-    tmp.deleteOnExit();
-
-    tag2morphParams.add("-o");
-    tag2morphParams.add(tmp.getAbsolutePath());
-
-    Launcher.main(tag2morphParams.toArray(new String[tag2morphParams.size()]));
-
-    return tmp;
-  }
-
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
index 193599b..5ea2e4f 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -19,7 +19,6 @@ package opennlp.morfologik.cmdline.builder;
 
 import java.io.File;
 
-import morfologik.stemming.EncoderType;
 import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
 import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.params.EncodingParameter;
@@ -29,18 +28,30 @@ import opennlp.tools.cmdline.params.EncodingParameter;
  */
 interface MorfologikDictionaryBuilderParams extends EncodingParameter {
 
-  @ParameterDescription(valueName = "in", description = "Plain file with one entry per line")
+  @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.")
   File getInputFile();
-
-  @ParameterDescription(valueName = "out", description = "The generated dictionary file.")
-  File getOutputFile();
-
-  @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
-  @OptionalParameter(defaultValue = "+")
-  String getFSADictSeparator();
   
-  @ParameterDescription(valueName = "sep", description = "The type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none]. Details are in Daciuk's paper and in the code. ")
-  @OptionalParameter(defaultValue = "prefix")
-  EncoderType getEncoderType();
-
+  @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).")
+  @OptionalParameter(defaultValue="false")
+  Boolean getAcceptBOM();
+  
+  @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\r).")
+  @OptionalParameter(defaultValue="false")
+  Boolean getAcceptCR();
+  
+  @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.")
+  @OptionalParameter(defaultValue="FSA5")
+  String getFormat();
+  
+  @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getIgnoreEmpty();
+  
+  @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getOverwrite();
+  
+  @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getValidate();
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
index 741515e..eb9b51c 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -17,11 +17,10 @@
 
 package opennlp.morfologik.cmdline.builder;
 
-import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
-
 import java.io.File;
-import java.nio.charset.Charset;
+import java.nio.file.Path;
 
+import morfologik.stemming.DictionaryMetadata;
 import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.tools.cmdline.BasicCmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -44,18 +43,16 @@ public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
     Params params = validateAndParseParams(args, Params.class);
 
     File dictInFile = params.getInputFile();
-    File dictOutFile = params.getOutputFile();
-    File propertiesFile = getExpectedPropertiesFile(dictOutFile);
-    Charset encoding = params.getEncoding();
 
     CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
-    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
-    CmdLineUtil.checkOutputFile("properties output file", propertiesFile);
+    Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath());
+    CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile());
 
     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
     try {
-      builder.build(dictInFile, dictOutFile, propertiesFile, encoding,
-          params.getFSADictSeparator(), params.getEncoderType());
+      builder.build(dictInFile.toPath(), params.getOverwrite(),
+          params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(),
+          params.getIgnoreEmpty());
     } catch (Exception e) {
       throw new TerminateToolException(-1,
           "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
index 99694a5..2090ce5 100644
--- a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
+++ b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -18,7 +18,7 @@
 package opennlp.morfologik.lemmatizer;
 
 import java.io.IOException;
-import java.net.URL;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -26,11 +26,11 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 import morfologik.stemming.Dictionary;
 import morfologik.stemming.DictionaryLookup;
 import morfologik.stemming.IStemmer;
 import morfologik.stemming.WordData;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 
 public class MorfologikLemmatizer implements DictionaryLemmatizer {
 
@@ -38,9 +38,9 @@ public class MorfologikLemmatizer implements DictionaryLemmatizer {
   public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
       "NNP", "NP00000"));
 
-  public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException,
+  public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
       IOException {
-    dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
+    dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
   }
 
   private HashMap<List<String>, String> getLemmaTagsDict(String word) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index f022a86..723b1ce 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -54,9 +54,21 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
   public MorfologikPOSTaggerFactory() {
   }
 
+  /**
+   * Creates a new {@link POSTaggerFactory} that uses a Morfologik-based {@link TagDictionary}.
+   * 
+   * @param ngramDictionary an n-gram dictionary
+   * @param morfologikDictionary a Morfologik dictionary
+   * @param morfologikDictionaryMetadata the dictionary metadata
+   * @throws IOException invalid Morfologik dictionary
+   */
   public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
-      TagDictionary posDictionary) {
+      byte[] morfologikDictionary, byte[] morfologikDictionaryMetadata) throws IOException {
     super(ngramDictionary, null);
+    this.dictData = morfologikDictionary;
+    this.dictInfo = morfologikDictionaryMetadata;
+    
+    this.dict = createMorfologikDictionary(dictData, dictInfo);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
index 730025c..0a7ba48 100644
--- a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -18,14 +18,12 @@
 package opennlp.morfologik.builder;
 
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Properties;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
 
 import junit.framework.TestCase;
-import morfologik.stemming.EncoderType;
+import morfologik.stemming.DictionaryMetadata;
 import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
 
 import org.junit.Test;
@@ -34,56 +32,27 @@ public class POSDictionayBuilderTest extends TestCase {
 
   @Test
   public void testBuildDictionary() throws Exception {
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
-        "/dictionaryWithLemma.txt").getFile());
-
-    File dictOutFile = File.createTempFile(
-        POSDictionayBuilderTest.class.getName(), ".dict");
-
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
+    
+    Path output = createMorfologikDictionary();
 
-    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
-        .toURL());
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
 
     assertNotNull(ml);
   }
-
-  @Test
-  public void testPropertiesCreation() throws Exception {
-
-    Charset c = Charset.forName("iso-8859-1");
-    String sep = "_";
+  
+  public static Path createMorfologikDictionary() throws Exception {
+    Path tabFilePath = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".txt").toPath();
+    Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath);
     
-    EncoderType encoderType = EncoderType.PREFIX;
-    Properties p = createPropertiesHelper(c, sep, encoderType);
-
-    assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
-    assertEquals(sep, p.getProperty("fsa.dict.separator"));
-    assertEquals(encoderType,
-        EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
+    Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
+        "/dictionaryWithLemma.txt"), tabFilePath, StandardCopyOption.REPLACE_EXISTING);
+    Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
+        "/dictionaryWithLemma.info"), infoFilePath, StandardCopyOption.REPLACE_EXISTING);
     
-    encoderType = EncoderType.SUFFIX;
-    p = createPropertiesHelper(c, sep, encoderType);
-    assertEquals(encoderType,
-        EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
-
-  }
-
-  private Properties createPropertiesHelper(Charset c, String sep,
-      EncoderType encoderType) throws IOException {
     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
-        ".info");
-    builder.createProperties(c, sep, encoderType, f);
-
-    InputStream is = new FileInputStream(f);
-
-    Properties prop = new Properties();
-    prop.load(is);
-    is.close();
-    f.delete();
-    return prop;
+    
+    return builder.build(tabFilePath);
   }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
index 87fc2cc..6b7525e 100644
--- a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -2,11 +2,8 @@ package opennlp.morfologik.lemmatizer;
 
 import static org.junit.Assert.assertEquals;
 
-import java.io.File;
-import java.nio.charset.Charset;
+import java.nio.file.Path;
 
-import morfologik.stemming.EncoderType;
-import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.morfologik.builder.POSDictionayBuilderTest;
 import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 
@@ -28,17 +25,9 @@ public class MorfologikLemmatizerTest {
   private MorfologikLemmatizer createDictionary(boolean caseSensitive)
       throws Exception {
 
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
-        "/dictionaryWithLemma.txt").getFile());
+    Path output = POSDictionayBuilderTest.createMorfologikDictionary();
 
-    File dictOutFile = File.createTempFile(
-        POSDictionayBuilderTest.class.getName(), ".dict");
-
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
-
-    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
-        .toURL());
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
 
     return ml;
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
index d605e15..c6c9e04 100644
--- a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -3,16 +3,11 @@ package opennlp.morfologik.tagdict;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
-import java.io.File;
-import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.List;
 
 import morfologik.stemming.Dictionary;
-import morfologik.stemming.EncoderType;
-import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.morfologik.tagdict.MorfologikTagDictionary;
 import opennlp.tools.postag.TagDictionary;
 
 import org.junit.Test;
@@ -74,17 +69,8 @@ public class MorfologikTagDictionaryTest {
   private MorfologikTagDictionary createDictionary(boolean caseSensitive,
       List<String> constant) throws Exception {
 
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
-        "/dictionaryWithLemma.txt").getFile());
-
-    File dictOutFile = File.createTempFile(
-        POSDictionayBuilderTest.class.getName(), ".dict");
-
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
-
-    MorfologikTagDictionary ml = new MorfologikTagDictionary(
-        Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
+    Dictionary dic = Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary());
+    MorfologikTagDictionary ml = new MorfologikTagDictionary(dic, caseSensitive);
 
     return ml;
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
new file mode 100644
index 0000000..6c6814b
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -0,0 +1,108 @@
+///*
+// * Licensed to the Apache Software Foundation (ASF) under one or more
+// * contributor license agreements.  See the NOTICE file distributed with
+// * this work for additional information regarding copyright ownership.
+// * The ASF licenses this file to You under the Apache License, Version 2.0
+// * (the "License"); you may not use this file except in compliance with
+// * the License. You may obtain a copy of the License at
+// *
+// *     http://www.apache.org/licenses/LICENSE-2.0
+// *
+// * Unless required by applicable law or agreed to in writing, software
+// * distributed under the License is distributed on an "AS IS" BASIS,
+// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// * See the License for the specific language governing permissions and
+// * limitations under the License.
+// */
+//
+//package opennlp.morfologik.tagdict;
+//
+//import static org.junit.Assert.assertTrue;
+//
+//import java.io.ByteArrayInputStream;
+//import java.io.ByteArrayOutputStream;
+//import java.io.File;
+//import java.io.IOException;
+//import java.io.InputStream;
+//import java.io.InputStreamReader;
+//import java.nio.charset.Charset;
+//import java.nio.file.Files;
+//import java.nio.file.Path;
+//import java.nio.file.Paths;
+//
+//import morfologik.stemming.DictionaryMetadata;
+//import morfologik.stemming.EncoderType;
+//import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+//import opennlp.morfologik.builder.POSDictionayBuilderTest;
+//import opennlp.tools.dictionary.Dictionary;
+//import opennlp.tools.postag.DefaultPOSSequenceValidator;
+//import opennlp.tools.postag.POSContextGenerator;
+//import opennlp.tools.postag.POSDictionary;
+//import opennlp.tools.postag.POSModel;
+//import opennlp.tools.postag.POSSample;
+//import opennlp.tools.postag.POSTaggerFactory;
+//import opennlp.tools.postag.POSTaggerME;
+//import opennlp.tools.postag.WordTagSampleStream;
+//import opennlp.tools.util.BaseToolFactory;
+//import opennlp.tools.util.InvalidFormatException;
+//import opennlp.tools.util.ObjectStream;
+//import opennlp.tools.util.TrainingParameters;
+//import opennlp.tools.util.model.ModelType;
+//
+//import org.junit.Test;
+//
+///**
+// * Tests for the {@link POSTaggerFactory} class.
+// */
+//public class POSTaggerFactoryTest {
+//
+//  private static ObjectStream<POSSample> createSampleStream()
+//      throws IOException {
+//    InputStream in = POSTaggerFactoryTest.class.getClassLoader()
+//        .getResourceAsStream("AnnotatedSentences.txt");
+//
+//    return new WordTagSampleStream((new InputStreamReader(in)));
+//  }
+//
+//  static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
+//      throws IOException {
+//    return POSTaggerME.train("en", createSampleStream(),
+//        TrainingParameters.defaultParams(), factory);
+//  }
+//
+//  @Test
+//  public void testPOSTaggerWithCustomFactory() throws Exception {
+//
+//    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+//    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+//        "/dictionaryWithLemma.txt").getFile());
+//
+//    File dictOutFile = File.createTempFile(
+//        POSDictionayBuilderTest.class.getName(), ".dict");
+//
+//    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+",
+//        EncoderType.PREFIX);
+//
+//    Path dictPath = dictOutFile.toPath();
+//    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictPath);
+//
+//    byte[] dic = Files.readAllBytes(dictPath);
+//    byte[] meta = Files.readAllBytes(metaPath);
+//
+//    POSModel posModel = trainPOSModel(ModelType.MAXENT,
+//        new MorfologikPOSTaggerFactory(null, dic, meta));
+//
+//    POSTaggerFactory factory = posModel.getFactory();
+//    assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
+//
+//    ByteArrayOutputStream out = new ByteArrayOutputStream();
+//    posModel.serialize(out);
+//    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+//
+//    POSModel fromSerialized = new POSModel(in);
+//
+//    factory = fromSerialized.getFactory();
+//    assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
+//  }
+//
+//}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/AnnotatedSentences.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/AnnotatedSentences.txt b/src/test/resources/AnnotatedSentences.txt
new file mode 100644
index 0000000..b40be87
--- /dev/null
+++ b/src/test/resources/AnnotatedSentences.txt
@@ -0,0 +1,136 @@
+Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._.
+I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._.
+So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._.
+She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._.
+I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._.
+
+Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._.
+Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._.
+I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._.
+As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._.
+The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._.
+But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._.
+It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._.
+She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstraße_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._.
+
+Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_.
+About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
+That_DT sounds_VBZ good_JJ ._.
+So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstraße_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._.
+I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstraße_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._.
+Thank_VB you_PRP very_RB much_RB !_.
+
+Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._.
+I_PRP 'm_VBP in_IN Nürnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._.
+Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstraße_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Café_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._.
+Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_.
+
+My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._.
+We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._.
+Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._.
+As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._.
+But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._.
+Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstraße_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._.
+
+I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._.
+I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._.
+I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._.
+I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._.
+
+Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._.
+As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._.
+The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._.
+Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._.
+
+An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._.
+Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._.
+She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._.
+But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._.
+So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT café_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._.
+It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._.
+She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._.
+
+Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_.
+They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._.
+Are_VBP they_PRP made_VBN of_IN leather_NN ?_.
+No,_NNP that_DT 's_VBZ faked_VBN ._.
+But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._.
+I_PRP got_VBD them_PRP from_IN Hamburg._NNP
+Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstraße_NNP ?_.
+It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._.
+I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._.
+Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_.
+Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstraße_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._.
+I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._.
+His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._.
+
+Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_.
+My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_.
+How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_.
+His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._.
+I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN
+I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP
+Is_VBZ that_DT right_NN ?_.
+Yes_UH ,_, it_PRP definitely_RB is_VBZ ._.
+So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Straße_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._.
+Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._.
+Bye_NNP !_.
+
+On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._.
+The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._.
+Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._.
+But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._.
+Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstraße_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._.
+
+Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._.
+I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._.
+I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._.
+An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN Gärtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._.
+Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN Gärtnerweg_NNP 25_CD ._.
+The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstraße_NNP 15_CD ._.
+He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._.
+
+Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._.
+He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._.
+Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._.
+Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._.
+It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._.
+
+When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._.
+He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._.
+One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._.
+So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._.
+She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstraße_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._.
+Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._.
+
+On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._.
+The_DT driver_NN got_VBD badly_RB injured_VBN ._.
+Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._.
+A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._.
+He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._.
+He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-Löw-Straße_NNP 13_CD in_IN 84630_CD Gauting_NNP ._.
+The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._.
+Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._.
+
+Hi_NNP ,_, how_WRB are_VBP you_PRP ?_.
+Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_.
+Yeah_UH for_IN sure_JJ ._.
+How_WRB did_VBD you_PRP know_VB that_DT ?_.
+I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._.
+Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._.
+Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_.
+Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._.
+But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._.
+I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstraße_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._.
+The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._.
+I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._.
+Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_.
+
+My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._.
+When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._.
+My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstraße_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._.
+But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._.
+So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._.
+Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._.
+We_PRP live_VBP in_IN Bürgerstraße_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._.
+I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/dictionaryWithLemma.info
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.info b/src/test/resources/dictionaryWithLemma.info
new file mode 100644
index 0000000..ad5fe8d
--- /dev/null
+++ b/src/test/resources/dictionaryWithLemma.info
@@ -0,0 +1,15 @@
+#
+# REQUIRED PROPERTIES
+#
+
+# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding.
+fsa.dict.separator=,
+
+# The charset in which the input is encoded. UTF-8 is strongly recommended.
+fsa.dict.encoding=UTF-8
+
+# The type of lemma-inflected form encoding compression that precedes automaton
+# construction. Allowed values: [suffix, infix, prefix, none].
+# Details are in Daciuk's paper and in the code. 
+# Leave at 'prefix' if not sure.
+fsa.dict.encoder=prefix
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt
index 5ac7111..09d39e3 100644
--- a/src/test/resources/dictionaryWithLemma.txt
+++ b/src/test/resources/dictionaryWithLemma.txt
@@ -1,10 +1,11 @@
-casa	casa	NOUN
-casa	casar	V
-Casa	Casa	PROP
-casinha	casa	NOUN
-casona	casa	NOUN
-menina	menino	NOUN
-menino	menino	NOUN
-menin�o	menino	NOUN
-menininho	menino	NOUN
-carro		NOUN
+casa,casa,NOUN
+casar,casa,V
+casar,casar,V-INF
+Casa,Casa,PROP
+casa,casinha,NOUN
+casa,casona,NOUN
+menino,menina,NOUN
+menino,menino,NOUN
+menino,meninão,NOUN
+menino,menininho,NOUN
+carro,carro,NOUN
\ No newline at end of file