You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2016/11/09 21:10:55 UTC

[01/16] opennlp git commit: OPENNLP-582 Added morfologik addon. Thanks to Rodrigo Agerri for providing a patch.

Repository: opennlp
Updated Branches:
  refs/heads/trunk 92e541c93 -> 49f8e25a1


OPENNLP-582 Added morfologik addon. Thanks to Rodrigo Agerri for providing a patch.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f3e90579
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f3e90579
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f3e90579

Branch: refs/heads/trunk
Commit: f3e90579c5feba71dc4f04adaa4acc5ecc7f72e9
Parents: 
Author: Jörn Kottmann <jo...@apache.org>
Authored: Thu Nov 14 21:24:13 2013 +0000
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Nov 14 21:24:13 2013 +0000

----------------------------------------------------------------------
 pom.xml                                         | 50 ++++++++++
 .../lemmatizer/MorfologikLemmatizer.java        | 96 ++++++++++++++++++++
 2 files changed, 146 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/f3e90579/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..67e1eaa
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,50 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>org.apache.opennlp</groupId>
+  <artifactId>morfologik-addon</artifactId>
+  <version>1.0-SNAPSHOT</version>
+  <packaging>jar</packaging>
+  <name>Morfologik Addon</name>
+
+  <url>http://maven.apache.org</url>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>2.3.2</version>
+                <configuration>
+                    <source>1.7</source>
+                    <target>1.7</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+    <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+   <dependency>
+      <groupId>org.carrot2</groupId>
+      <artifactId>morfologik-stemming</artifactId>
+      <version>1.6.0</version>
+      <scope>compile</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.opennlp</groupId>
+      <artifactId>opennlp-tools</artifactId>
+      <version>1.6.0-SNAPSHOT</version>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>3.8.1</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f3e90579/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
new file mode 100644
index 0000000..99694a5
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.lemmatizer;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+
+public class MorfologikLemmatizer implements DictionaryLemmatizer {
+
+  private IStemmer dictLookup;
+  public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
+      "NNP", "NP00000"));
+
+  public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException,
+      IOException {
+    dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
+  }
+
+  private HashMap<List<String>, String> getLemmaTagsDict(String word) {
+    List<WordData> wdList = dictLookup.lookup(word);
+    HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
+    for (WordData wd : wdList) {
+      List<String> wordLemmaTags = new ArrayList<String>();
+      wordLemmaTags.add(word);
+      wordLemmaTags.add(wd.getTag().toString());
+      dictMap.put(wordLemmaTags, wd.getStem().toString());
+    }
+    return dictMap;
+  }
+
+  private List<String> getDictKeys(String word, String postag) {
+    List<String> keys = new ArrayList<String>();
+    if (constantTags.contains(postag)) {
+      keys.addAll(Arrays.asList(word, postag));
+    } else {
+      keys.addAll(Arrays.asList(word.toLowerCase(), postag));
+    }
+    return keys;
+  }
+
+  private HashMap<List<String>, String> getDictMap(String word, String postag) {
+    HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
+
+    if (constantTags.contains(postag)) {
+      dictMap = this.getLemmaTagsDict(word);
+    } else {
+      dictMap = this.getLemmaTagsDict(word.toLowerCase());
+    }
+    return dictMap;
+  }
+
+  public String lemmatize(String word, String postag) {
+    String lemma = null;
+    List<String> keys = this.getDictKeys(word, postag);
+    HashMap<List<String>, String> dictMap = this.getDictMap(word, postag);
+    // lookup lemma as value of the map
+    String keyValue = dictMap.get(keys);
+    if (keyValue != null) {
+      lemma = keyValue;
+    } else if (keyValue == null && constantTags.contains(postag)) {
+      lemma = word;
+    } else if (keyValue == null && word.toUpperCase() == word) {
+      lemma = word;
+    } else {
+      lemma = word.toLowerCase();
+    }
+    return lemma;
+  }
+}


[15/16] opennlp git commit: OPENNLP-622 Merge branch 'master' of ../opennlp-addons into trunk

Posted by co...@apache.org.
OPENNLP-622 Merge branch 'master' of ../opennlp-addons into trunk


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/9b448044
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/9b448044
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/9b448044

Branch: refs/heads/trunk
Commit: 9b4480446b72f9120283e0da10697657072850f3
Parents: 92e541c 772f31f
Author: William Colen <wi...@gmail.com>
Authored: Wed Nov 9 18:28:46 2016 -0200
Committer: William Colen <wi...@gmail.com>
Committed: Wed Nov 9 18:28:46 2016 -0200

----------------------------------------------------------------------
 opennlp-morfologik-addon/bin/morfologik-addon   |  20 ++
 .../bin/morfologik-addon.bat                    |  21 ++
 opennlp-morfologik-addon/pom.xml                | 109 +++++++++
 .../src/main/assembly/bin.xml                   |  91 ++++++++
 .../src/main/assembly/src.xml                   |  39 ++++
 .../src/main/bin/morfologik-addon               |  35 +++
 .../src/main/bin/morfologik-addon.bat           |  47 ++++
 .../src/main/bin/opennlp-cp                     |  35 +++
 .../builder/MorfologikDictionayBuilder.java     | 103 +++++++++
 .../java/opennlp/morfologik/cmdline/CLI.java    | 164 +++++++++++++
 .../MorfologikDictionaryBuilderParams.java      |  57 +++++
 .../MorfologikDictionaryBuilderTool.java        |  62 +++++
 .../builder/XMLDictionaryToTableParams.java     |  45 ++++
 .../builder/XMLDictionaryToTableTool.java       | 127 ++++++++++
 .../lemmatizer/MorfologikLemmatizer.java        |  96 ++++++++
 .../tagdict/MorfologikPOSTaggerFactory.java     | 170 ++++++++++++++
 .../tagdict/MorfologikTagDictionary.java        |  90 ++++++++
 .../opennlp/morfologik/util/MorfologikUtil.java |  36 +++
 .../src/main/readme/LICENSE                     | 230 +++++++++++++++++++
 .../src/main/readme/MORFOLOGIK-LICENSE          |  28 +++
 opennlp-morfologik-addon/src/main/readme/NOTICE |  11 +
 .../builder/POSDictionayBuilderTest.java        |  58 +++++
 .../lemmatizer/MorfologikLemmatizerTest.java    |  35 +++
 .../tagdict/MorfologikTagDictionaryTest.java    |  78 +++++++
 .../tagdict/POSTaggerFactoryTest.java           |  88 +++++++
 .../src/test/resources/AnnotatedSentences.txt   | 136 +++++++++++
 .../src/test/resources/dictionaryWithLemma.info |  15 ++
 .../src/test/resources/dictionaryWithLemma.txt  |  11 +
 28 files changed, 2037 insertions(+)
----------------------------------------------------------------------



[04/16] opennlp git commit: OPENNLP-622 Refactored to remove usage of main methods of Morfologik.

Posted by co...@apache.org.
OPENNLP-622 Refactored to remove usage of main methods of Morfologik.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/1314887f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/1314887f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/1314887f

Branch: refs/heads/trunk
Commit: 1314887fe657f21e1213788fd6084a485781f2f1
Parents: 15c3fb7
Author: William Colen <co...@apache.org>
Authored: Thu Jul 7 05:19:18 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 7 05:19:18 2016 +0000

----------------------------------------------------------------------
 .../builder/MorfologikDictionayBuilder.java     | 144 ++++++-------------
 .../MorfologikDictionaryBuilderParams.java      |  37 +++--
 .../MorfologikDictionaryBuilderTool.java        |  17 +--
 .../lemmatizer/MorfologikLemmatizer.java        |   8 +-
 .../tagdict/MorfologikPOSTaggerFactory.java     |  14 +-
 .../builder/POSDictionayBuilderTest.java        |  67 +++------
 .../lemmatizer/MorfologikLemmatizerTest.java    |  17 +--
 .../tagdict/MorfologikTagDictionaryTest.java    |  18 +--
 .../tagdict/POSTaggerFactoryTest.java           | 108 ++++++++++++++
 src/test/resources/AnnotatedSentences.txt       | 136 ++++++++++++++++++
 src/test/resources/dictionaryWithLemma.info     |  15 ++
 src/test/resources/dictionaryWithLemma.txt      |  21 +--
 12 files changed, 386 insertions(+), 216 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
index 0131318..dbbca4d 100644
--- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
+++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -17,21 +17,15 @@
 
 package opennlp.morfologik.builder;
 
-import java.io.File;
 import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
 import java.util.Properties;
 
 import morfologik.stemming.DictionaryMetadata;
 import morfologik.stemming.EncoderType;
-import morfologik.tools.FSACompile;
-import morfologik.tools.Launcher;
+import morfologik.tools.DictCompile;
 
 /**
  * Utility class to build Morfologik dictionaries from a tab separated values
@@ -41,117 +35,69 @@ import morfologik.tools.Launcher;
 public class MorfologikDictionayBuilder {
 
   /**
-   * Build a Morfologik binary dictionary
-   *
-   * @param dictInFile
-   *          the 3 column TSV dictionary file
-   * @param dictOutFile
-   *          where to store the binary Morfologik dictionary
-   * @param encoding
-   *          the encoding to be used while reading and writing
-   * @param separator
-   *          a field separator, the default is '+'. If your tags contains '+'
-   *          change to something else
-   * @param encoderType
-   *          the Morfologik enconder type
-   * @param isUseInfixes
-   *          if to compact using infixes
+   * Helper to compile a morphological dictionary automaton.
+   * 
+   * @param input
+   *          The input file (base,inflected,tag). An associated metadata
+   *          (*.info) file must exist.
+   * @param overwrite
+   *          Overwrite the output file if it exists.
+   * @param validate
+   *          Validate input to make sure it makes sense.
+   * @param acceptBom
+   *          Accept leading BOM bytes (UTF-8).
+   * @param acceptCr
+   *          Accept CR bytes in input sequences (\r).
+   * @param ignoreEmpty
+   *          Ignore empty lines in the input.
+   * @return the dictionary path
+   * 
    * @throws Exception
    */
-  public void build(File dictInFile, File dictOutFile, Charset encoding,
-      String separator, EncoderType encoderType)
+  public Path build(Path input, boolean overwrite, boolean validate,
+      boolean acceptBom, boolean acceptCr, boolean ignoreEmpty)
       throws Exception {
-    Path propertiesPath = DictionaryMetadata
-        .getExpectedMetadataLocation(dictOutFile.toPath()); 
+
+    DictCompile compiler = new DictCompile(input, overwrite, validate,
+        acceptBom, acceptCr, ignoreEmpty);
+    compiler.call();
+
+    
+    Path metadataPath = DictionaryMetadata
+        .getExpectedMetadataLocation(input);
     
-    this.build(dictInFile, dictOutFile, propertiesPath.toFile(), encoding, separator,
-        encoderType);
+    return metadataPath.resolveSibling(
+        metadataPath.getFileName().toString().replaceAll(
+            "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict"));
   }
 
   /**
-   * Build a Morfologik binary dictionary
-   *
-   * @param dictInFile
-   *          the 3 column TSV dictionary file
-   * @param dictOutFile
-   *          where to store the binary Morfologik dictionary
-   * @param propertiesOutFile
-   *          where to store the properties of the Morfologik dictionary
-   * @param encoding
-   *          the encoding to be used while reading and writing
-   * @param separator
-   *          a field separator, the default is '+'. If your tags contains '+'
-   *          change to something else
-   * @param isUsePrefixes
-   *          if to compact using prefixes
-   * @param isUseInfixes
-   *          if to compact using infixes
+   * Helper to compile a morphological dictionary automaton using default
+   * parameters.
+   * 
+   * @param input
+   *          The input file (base,inflected,tag). An associated metadata
+   *          (*.info) file must exist.
+   *          
+   *  @return the dictionary path
+   * 
    * @throws Exception
    */
-  public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
-      Charset encoding, String separator, EncoderType encoderType) throws Exception {
-
-    // we need to execute tab2morph followed by fsa_build
-
-    File morph = tab2morph(dictInFile, separator, encoderType);
+  public Path build(Path input) throws Exception {
 
-    fsaBuild(morph, dictOutFile);
+    return build(input, true, true, false, false, false);
 
-    morph.delete();
-
-    // now we create the properties files using the passed parameters
-    createProperties(encoding, separator, encoderType,
-        propertiesOutFile);
   }
 
-  void createProperties(Charset encoding, String separator,
-		  EncoderType encoderType, File propertiesFile)
-      throws FileNotFoundException, IOException {
+  Properties createProperties(Charset encoding, String separator,
+      EncoderType encoderType) throws FileNotFoundException, IOException {
 
     Properties properties = new Properties();
     properties.setProperty("fsa.dict.separator", separator);
     properties.setProperty("fsa.dict.encoding", encoding.name());
     properties.setProperty("fsa.dict.encoder", encoderType.name());
 
-    OutputStream os = new FileOutputStream(propertiesFile);
-    properties.store(os, "Morfologik POS Dictionary properties");
-    os.close();
-
-  }
+    return properties;
 
-  private void fsaBuild(File morph, File dictOutFile) throws Exception {
-    String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
-        dictOutFile.getAbsolutePath() };
-    FSACompile.main(params);
-    // FSABuildTool.main(params);
   }
-
-  private File tab2morph(File dictInFile, String separator,
-      EncoderType encoderType) throws Exception {
-
-    // create tab2morph parameters
-    List<String> tag2morphParams = new ArrayList<String>();
-    tag2morphParams.add("tab2morph");
-
-    tag2morphParams.add("--annotation");
-    tag2morphParams.add(separator);
-    
-    tag2morphParams.add("--e");
-    tag2morphParams.add(encoderType.name());
-
-    tag2morphParams.add("-i");
-    tag2morphParams.add(dictInFile.getAbsolutePath());
-
-    // we need a temporary file to store the intermediate output
-    File tmp = File.createTempFile("tab2morph", ".txt");
-    tmp.deleteOnExit();
-
-    tag2morphParams.add("-o");
-    tag2morphParams.add(tmp.getAbsolutePath());
-
-    Launcher.main(tag2morphParams.toArray(new String[tag2morphParams.size()]));
-
-    return tmp;
-  }
-
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
index 193599b..5ea2e4f 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -19,7 +19,6 @@ package opennlp.morfologik.cmdline.builder;
 
 import java.io.File;
 
-import morfologik.stemming.EncoderType;
 import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
 import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.params.EncodingParameter;
@@ -29,18 +28,30 @@ import opennlp.tools.cmdline.params.EncodingParameter;
  */
 interface MorfologikDictionaryBuilderParams extends EncodingParameter {
 
-  @ParameterDescription(valueName = "in", description = "Plain file with one entry per line")
+  @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.")
   File getInputFile();
-
-  @ParameterDescription(valueName = "out", description = "The generated dictionary file.")
-  File getOutputFile();
-
-  @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
-  @OptionalParameter(defaultValue = "+")
-  String getFSADictSeparator();
   
-  @ParameterDescription(valueName = "sep", description = "The type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none]. Details are in Daciuk's paper and in the code. ")
-  @OptionalParameter(defaultValue = "prefix")
-  EncoderType getEncoderType();
-
+  @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).")
+  @OptionalParameter(defaultValue="false")
+  Boolean getAcceptBOM();
+  
+  @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\r).")
+  @OptionalParameter(defaultValue="false")
+  Boolean getAcceptCR();
+  
+  @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.")
+  @OptionalParameter(defaultValue="FSA5")
+  String getFormat();
+  
+  @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getIgnoreEmpty();
+  
+  @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getOverwrite();
+  
+  @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getValidate();
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
index 741515e..eb9b51c 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -17,11 +17,10 @@
 
 package opennlp.morfologik.cmdline.builder;
 
-import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
-
 import java.io.File;
-import java.nio.charset.Charset;
+import java.nio.file.Path;
 
+import morfologik.stemming.DictionaryMetadata;
 import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.tools.cmdline.BasicCmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -44,18 +43,16 @@ public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
     Params params = validateAndParseParams(args, Params.class);
 
     File dictInFile = params.getInputFile();
-    File dictOutFile = params.getOutputFile();
-    File propertiesFile = getExpectedPropertiesFile(dictOutFile);
-    Charset encoding = params.getEncoding();
 
     CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
-    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
-    CmdLineUtil.checkOutputFile("properties output file", propertiesFile);
+    Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath());
+    CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile());
 
     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
     try {
-      builder.build(dictInFile, dictOutFile, propertiesFile, encoding,
-          params.getFSADictSeparator(), params.getEncoderType());
+      builder.build(dictInFile.toPath(), params.getOverwrite(),
+          params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(),
+          params.getIgnoreEmpty());
     } catch (Exception e) {
       throw new TerminateToolException(-1,
           "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
index 99694a5..2090ce5 100644
--- a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
+++ b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -18,7 +18,7 @@
 package opennlp.morfologik.lemmatizer;
 
 import java.io.IOException;
-import java.net.URL;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -26,11 +26,11 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 import morfologik.stemming.Dictionary;
 import morfologik.stemming.DictionaryLookup;
 import morfologik.stemming.IStemmer;
 import morfologik.stemming.WordData;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 
 public class MorfologikLemmatizer implements DictionaryLemmatizer {
 
@@ -38,9 +38,9 @@ public class MorfologikLemmatizer implements DictionaryLemmatizer {
   public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
       "NNP", "NP00000"));
 
-  public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException,
+  public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
       IOException {
-    dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
+    dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
   }
 
   private HashMap<List<String>, String> getLemmaTagsDict(String word) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index f022a86..723b1ce 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -54,9 +54,21 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
   public MorfologikPOSTaggerFactory() {
   }
 
+  /**
+   * Creates a new {@link POSTaggerFactory} that uses the a Morfologik based {@link TagDictionary}.
+   * 
+   * @param ngramDictionary a ngramDictionary 
+   * @param morfologikDictionary a Morfologik dictionary
+   * @param morfologikDictionaryMetadata the dictionary metadata
+   * @throws IOException invalid Morfologik dictionary
+   */
   public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
-      TagDictionary posDictionary) {
+      byte[] morfologikDictionary, byte[] morfologikDictionaryMetadata) throws IOException {
     super(ngramDictionary, null);
+    this.dictData = morfologikDictionary;
+    this.dictInfo = morfologikDictionaryMetadata;
+    
+    this.dict = createMorfologikDictionary(dictData, dictInfo);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
index 730025c..0a7ba48 100644
--- a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -18,14 +18,12 @@
 package opennlp.morfologik.builder;
 
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Properties;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
 
 import junit.framework.TestCase;
-import morfologik.stemming.EncoderType;
+import morfologik.stemming.DictionaryMetadata;
 import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
 
 import org.junit.Test;
@@ -34,56 +32,27 @@ public class POSDictionayBuilderTest extends TestCase {
 
   @Test
   public void testBuildDictionary() throws Exception {
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
-        "/dictionaryWithLemma.txt").getFile());
-
-    File dictOutFile = File.createTempFile(
-        POSDictionayBuilderTest.class.getName(), ".dict");
-
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
+    
+    Path output = createMorfologikDictionary();
 
-    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
-        .toURL());
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
 
     assertNotNull(ml);
   }
-
-  @Test
-  public void testPropertiesCreation() throws Exception {
-
-    Charset c = Charset.forName("iso-8859-1");
-    String sep = "_";
+  
+  public static Path createMorfologikDictionary() throws Exception {
+    Path tabFilePath = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".txt").toPath();
+    Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath);
     
-    EncoderType encoderType = EncoderType.PREFIX;
-    Properties p = createPropertiesHelper(c, sep, encoderType);
-
-    assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
-    assertEquals(sep, p.getProperty("fsa.dict.separator"));
-    assertEquals(encoderType,
-        EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
+    Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
+        "/dictionaryWithLemma.txt"), tabFilePath, StandardCopyOption.REPLACE_EXISTING);
+    Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
+        "/dictionaryWithLemma.info"), infoFilePath, StandardCopyOption.REPLACE_EXISTING);
     
-    encoderType = EncoderType.SUFFIX;
-    p = createPropertiesHelper(c, sep, encoderType);
-    assertEquals(encoderType,
-        EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
-
-  }
-
-  private Properties createPropertiesHelper(Charset c, String sep,
-      EncoderType encoderType) throws IOException {
     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
-        ".info");
-    builder.createProperties(c, sep, encoderType, f);
-
-    InputStream is = new FileInputStream(f);
-
-    Properties prop = new Properties();
-    prop.load(is);
-    is.close();
-    f.delete();
-    return prop;
+    
+    return builder.build(tabFilePath);
   }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
index 87fc2cc..6b7525e 100644
--- a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -2,11 +2,8 @@ package opennlp.morfologik.lemmatizer;
 
 import static org.junit.Assert.assertEquals;
 
-import java.io.File;
-import java.nio.charset.Charset;
+import java.nio.file.Path;
 
-import morfologik.stemming.EncoderType;
-import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.morfologik.builder.POSDictionayBuilderTest;
 import opennlp.tools.lemmatizer.DictionaryLemmatizer;
 
@@ -28,17 +25,9 @@ public class MorfologikLemmatizerTest {
   private MorfologikLemmatizer createDictionary(boolean caseSensitive)
       throws Exception {
 
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
-        "/dictionaryWithLemma.txt").getFile());
+    Path output = POSDictionayBuilderTest.createMorfologikDictionary();
 
-    File dictOutFile = File.createTempFile(
-        POSDictionayBuilderTest.class.getName(), ".dict");
-
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
-
-    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
-        .toURL());
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
 
     return ml;
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
index d605e15..c6c9e04 100644
--- a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -3,16 +3,11 @@ package opennlp.morfologik.tagdict;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
-import java.io.File;
-import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.List;
 
 import morfologik.stemming.Dictionary;
-import morfologik.stemming.EncoderType;
-import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.morfologik.tagdict.MorfologikTagDictionary;
 import opennlp.tools.postag.TagDictionary;
 
 import org.junit.Test;
@@ -74,17 +69,8 @@ public class MorfologikTagDictionaryTest {
   private MorfologikTagDictionary createDictionary(boolean caseSensitive,
       List<String> constant) throws Exception {
 
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
-        "/dictionaryWithLemma.txt").getFile());
-
-    File dictOutFile = File.createTempFile(
-        POSDictionayBuilderTest.class.getName(), ".dict");
-
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
-
-    MorfologikTagDictionary ml = new MorfologikTagDictionary(
-        Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
+    Dictionary dic = Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary());
+    MorfologikTagDictionary ml = new MorfologikTagDictionary(dic, caseSensitive);
 
     return ml;
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
new file mode 100644
index 0000000..6c6814b
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -0,0 +1,108 @@
+///*
+// * Licensed to the Apache Software Foundation (ASF) under one or more
+// * contributor license agreements.  See the NOTICE file distributed with
+// * this work for additional information regarding copyright ownership.
+// * The ASF licenses this file to You under the Apache License, Version 2.0
+// * (the "License"); you may not use this file except in compliance with
+// * the License. You may obtain a copy of the License at
+// *
+// *     http://www.apache.org/licenses/LICENSE-2.0
+// *
+// * Unless required by applicable law or agreed to in writing, software
+// * distributed under the License is distributed on an "AS IS" BASIS,
+// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// * See the License for the specific language governing permissions and
+// * limitations under the License.
+// */
+//
+//package opennlp.morfologik.tagdict;
+//
+//import static org.junit.Assert.assertTrue;
+//
+//import java.io.ByteArrayInputStream;
+//import java.io.ByteArrayOutputStream;
+//import java.io.File;
+//import java.io.IOException;
+//import java.io.InputStream;
+//import java.io.InputStreamReader;
+//import java.nio.charset.Charset;
+//import java.nio.file.Files;
+//import java.nio.file.Path;
+//import java.nio.file.Paths;
+//
+//import morfologik.stemming.DictionaryMetadata;
+//import morfologik.stemming.EncoderType;
+//import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+//import opennlp.morfologik.builder.POSDictionayBuilderTest;
+//import opennlp.tools.dictionary.Dictionary;
+//import opennlp.tools.postag.DefaultPOSSequenceValidator;
+//import opennlp.tools.postag.POSContextGenerator;
+//import opennlp.tools.postag.POSDictionary;
+//import opennlp.tools.postag.POSModel;
+//import opennlp.tools.postag.POSSample;
+//import opennlp.tools.postag.POSTaggerFactory;
+//import opennlp.tools.postag.POSTaggerME;
+//import opennlp.tools.postag.WordTagSampleStream;
+//import opennlp.tools.util.BaseToolFactory;
+//import opennlp.tools.util.InvalidFormatException;
+//import opennlp.tools.util.ObjectStream;
+//import opennlp.tools.util.TrainingParameters;
+//import opennlp.tools.util.model.ModelType;
+//
+//import org.junit.Test;
+//
+///**
+// * Tests for the {@link POSTaggerFactory} class.
+// */
+//public class POSTaggerFactoryTest {
+//
+//  private static ObjectStream<POSSample> createSampleStream()
+//      throws IOException {
+//    InputStream in = POSTaggerFactoryTest.class.getClassLoader()
+//        .getResourceAsStream("AnnotatedSentences.txt");
+//
+//    return new WordTagSampleStream((new InputStreamReader(in)));
+//  }
+//
+//  static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
+//      throws IOException {
+//    return POSTaggerME.train("en", createSampleStream(),
+//        TrainingParameters.defaultParams(), factory);
+//  }
+//
+//  @Test
+//  public void testPOSTaggerWithCustomFactory() throws Exception {
+//
+//    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+//    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+//        "/dictionaryWithLemma.txt").getFile());
+//
+//    File dictOutFile = File.createTempFile(
+//        POSDictionayBuilderTest.class.getName(), ".dict");
+//
+//    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+",
+//        EncoderType.PREFIX);
+//
+//    Path dictPath = dictOutFile.toPath();
+//    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictPath);
+//
+//    byte[] dic = Files.readAllBytes(dictPath);
+//    byte[] meta = Files.readAllBytes(metaPath);
+//
+//    POSModel posModel = trainPOSModel(ModelType.MAXENT,
+//        new MorfologikPOSTaggerFactory(null, dic, meta));
+//
+//    POSTaggerFactory factory = posModel.getFactory();
+//    assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
+//
+//    ByteArrayOutputStream out = new ByteArrayOutputStream();
+//    posModel.serialize(out);
+//    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
+//
+//    POSModel fromSerialized = new POSModel(in);
+//
+//    factory = fromSerialized.getFactory();
+//    assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
+//  }
+//
+//}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/AnnotatedSentences.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/AnnotatedSentences.txt b/src/test/resources/AnnotatedSentences.txt
new file mode 100644
index 0000000..b40be87
--- /dev/null
+++ b/src/test/resources/AnnotatedSentences.txt
@@ -0,0 +1,136 @@
+Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._.
+I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._.
+So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._.
+She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._.
+I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._.
+
+Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._.
+Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._.
+I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._.
+As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._.
+The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._.
+But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._.
+It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._.
+She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstra\ufffde_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._.
+
+Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_.
+About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
+That_DT sounds_VBZ good_JJ ._.
+So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstra\ufffde_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._.
+I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstra\ufffde_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._.
+Thank_VB you_PRP very_RB much_RB !_.
+
+Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._.
+I_PRP 'm_VBP in_IN N\ufffdrnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._.
+Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstra\ufffde_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Caf\ufffd_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._.
+Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_.
+
+My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._.
+We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._.
+Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._.
+As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._.
+But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._.
+Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstra\ufffde_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._.
+
+I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._.
+I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._.
+I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._.
+I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._.
+
+Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._.
+As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._.
+The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._.
+Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._.
+
+An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._.
+Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._.
+She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._.
+But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._.
+So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT caf\ufffd_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._.
+It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._.
+She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._.
+
+Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_.
+They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._.
+Are_VBP they_PRP made_VBN of_IN leather_NN ?_.
+No,_NNP that_DT 's_VBZ faked_VBN ._.
+But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._.
+I_PRP got_VBD them_PRP from_IN Hamburg._NNP
+Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstra\ufffde_NNP ?_.
+It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._.
+I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._.
+Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_.
+Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstra\ufffde_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._.
+I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._.
+His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._.
+
+Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_.
+My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_.
+How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_.
+His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._.
+I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN
+I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP
+Is_VBZ that_DT right_NN ?_.
+Yes_UH ,_, it_PRP definitely_RB is_VBZ ._.
+So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Stra\ufffde_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._.
+Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._.
+Bye_NNP !_.
+
+On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._.
+The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._.
+Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._.
+But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._.
+Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstra\ufffde_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._.
+
+Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._.
+I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._.
+I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._.
+An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN G\ufffdrtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._.
+Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN G\ufffdrtnerweg_NNP 25_CD ._.
+The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstra\ufffde_NNP 15_CD ._.
+He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._.
+
+Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._.
+He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._.
+Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._.
+Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._.
+It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._.
+
+When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._.
+He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._.
+One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._.
+So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._.
+She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstra\ufffde_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._.
+Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._.
+
+On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._.
+The_DT driver_NN got_VBD badly_RB injured_VBN ._.
+Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._.
+A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._.
+He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._.
+He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-L\ufffdw-Stra\ufffde_NNP 13_CD in_IN 84630_CD Gauting_NNP ._.
+The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._.
+Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._.
+
+Hi_NNP ,_, how_WRB are_VBP you_PRP ?_.
+Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_.
+Yeah_UH for_IN sure_JJ ._.
+How_WRB did_VBD you_PRP know_VB that_DT ?_.
+I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._.
+Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._.
+Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_.
+Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._.
+But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._.
+I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstra\ufffde_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._.
+The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._.
+I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._.
+Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_.
+
+My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._.
+When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._.
+My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstra\ufffde_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._.
+But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._.
+So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._.
+Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._.
+We_PRP live_VBP in_IN B\ufffdrgerstra\ufffde_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._.
+I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/dictionaryWithLemma.info
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.info b/src/test/resources/dictionaryWithLemma.info
new file mode 100644
index 0000000..ad5fe8d
--- /dev/null
+++ b/src/test/resources/dictionaryWithLemma.info
@@ -0,0 +1,15 @@
+#
+# REQUIRED PROPERTIES
+#
+
+# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding.
+fsa.dict.separator=,
+
+# The charset in which the input is encoded. UTF-8 is strongly recommended.
+fsa.dict.encoding=UTF-8
+
+# The type of lemma-inflected form encoding compression that precedes automaton
+# construction. Allowed values: [suffix, infix, prefix, none].
+# Details are in Daciuk's paper and in the code. 
+# Leave at 'prefix' if not sure.
+fsa.dict.encoder=prefix
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt
index 5ac7111..09d39e3 100644
--- a/src/test/resources/dictionaryWithLemma.txt
+++ b/src/test/resources/dictionaryWithLemma.txt
@@ -1,10 +1,11 @@
-casa	casa	NOUN
-casa	casar	V
-Casa	Casa	PROP
-casinha	casa	NOUN
-casona	casa	NOUN
-menina	menino	NOUN
-menino	menino	NOUN
-meninão	menino	NOUN
-menininho	menino	NOUN
-carro		NOUN
+casa,casa,NOUN
+casar,casa,V
+casar,casar,V-INF
+Casa,Casa,PROP
+casa,casinha,NOUN
+casa,casona,NOUN
+menino,menina,NOUN
+menino,menino,NOUN
+menino,meninão,NOUN
+menino,menininho,NOUN
+carro,carro,NOUN
\ No newline at end of file


[02/16] opennlp git commit: OPENNLP-622 Added code to create Morfologik data from TSV or OpenNLP XML tag dictionaries. Created a TagDictionary implementation using Morfologik. Added a POSTaggerFactory to bundle the Morfologik dictionaries in POS Tagger models.

Posted by co...@apache.org.
OPENNLP-622 Added code to create Morfologik data from TSV or OpenNLP XML tag dictionaries. Created a TagDictionary implementation using Morfologik. Added a POSTaggerFactory to bundle the Morfologik dictionaries in POS Tagger models.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/78dd579b
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/78dd579b
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/78dd579b

Branch: refs/heads/trunk
Commit: 78dd579b0e013b3132caae35afe71113764742e9
Parents: f3e9057
Author: William Colen <co...@apache.org>
Authored: Mon Dec 2 13:23:04 2013 +0000
Committer: William Colen <co...@apache.org>
Committed: Mon Dec 2 13:23:04 2013 +0000

----------------------------------------------------------------------
 pom.xml                                         |  19 +-
 .../builder/MorfologikDictionayBuilder.java     | 163 ++++++++++++++++
 .../java/opennlp/morfologik/cmdline/CLI.java    | 164 +++++++++++++++++
 .../MorfologikDictionaryBuilderParams.java      |  49 +++++
 .../MorfologikDictionaryBuilderTool.java        |  71 +++++++
 .../builder/XMLDictionaryToTableParams.java     |  36 ++++
 .../builder/XMLDictionaryToTableTool.java       |  82 +++++++++
 .../tagdict/MorfologikPOSTaggerFactory.java     | 184 +++++++++++++++++++
 .../tagdict/MorfologikTagDictionary.java        |  90 +++++++++
 .../builder/POSDictionayBuilderTest.java        | 101 ++++++++++
 .../lemmatizer/MorfologikLemmatizerTest.java    |  46 +++++
 .../tagdict/MorfologikTagDictionaryTest.java    |  92 ++++++++++
 src/test/resources/dictionaryWithLemma.txt      |  10 +
 13 files changed, 1101 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 67e1eaa..51854f6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -33,6 +33,12 @@
       <version>1.6.0</version>
       <scope>compile</scope>
     </dependency>
+   <dependency>
+      <groupId>org.carrot2</groupId>
+      <artifactId>morfologik-tools</artifactId>
+      <version>1.6.0</version>
+      <scope>compile</scope>
+    </dependency>
 
     <dependency>
       <groupId>org.apache.opennlp</groupId>
@@ -40,11 +46,12 @@
       <version>1.6.0-SNAPSHOT</version>
     </dependency>
 
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <version>3.8.1</version>
-      <scope>test</scope>
-    </dependency>
+	<dependency>
+		<groupId>junit</groupId>
+		<artifactId>junit</artifactId>
+		<version>4.8.1</version>
+		<scope>test</scope>
+	</dependency>
+
   </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
new file mode 100644
index 0000000..b8bcfbf
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import morfologik.stemming.Dictionary;
+import morfologik.tools.FSABuildTool;
+import morfologik.tools.Launcher;
+
+/**
+ * Utility class to build Morfologik dictionaries from a tab separated values
+ * file. The first column is the word, the second its lemma and the third a POS
+ * tag. If there is no lemma information leave the second column empty.
+ * <p>
+ * Building is done in two steps: the table is first converted by the
+ * Morfologik <code>tab2morph</code> tool into a temporary file, which is then
+ * compiled by {@link FSABuildTool}. Finally a properties file holding the
+ * separator, the encoding and the compaction flags is written.
+ */
+public class MorfologikDictionayBuilder {
+
+  /**
+   * Build a Morfologik binary dictionary. The properties file is stored at the
+   * location returned by {@link Dictionary#getExpectedFeaturesName(String)}
+   * for the absolute path of <code>dictOutFile</code>.
+   *
+   * @param dictInFile
+   *          the 3 column TSV dictionary file
+   * @param dictOutFile
+   *          where to store the binary Morfologik dictionary
+   * @param encoding
+   *          the encoding to be used while reading and writing
+   * @param separator
+   *          a field separator, the default is '+'. If your tags contain '+'
+   *          change to something else
+   * @param isUsePrefixes
+   *          if to compact using prefixes
+   * @param isUseInfixes
+   *          if to compact using infixes
+   * @throws Exception
+   *           if one of the Morfologik tools fails or the properties file
+   *           can not be written
+   */
+  public void build(File dictInFile, File dictOutFile, Charset encoding,
+      String separator, boolean isUsePrefixes, boolean isUseInfixes)
+      throws Exception {
+
+    File propertiesFile = new File(
+        Dictionary.getExpectedFeaturesName(dictOutFile.getAbsolutePath()));
+    this.build(dictInFile, dictOutFile, propertiesFile, encoding, separator,
+        isUsePrefixes, isUseInfixes);
+  }
+
+  /**
+   * Build a Morfologik binary dictionary
+   *
+   * @param dictInFile
+   *          the 3 column TSV dictionary file
+   * @param dictOutFile
+   *          where to store the binary Morfologik dictionary
+   * @param propertiesOutFile
+   *          where to store the properties of the Morfologik dictionary
+   * @param encoding
+   *          the encoding to be used while reading and writing
+   * @param separator
+   *          a field separator, the default is '+'. If your tags contain '+'
+   *          change to something else
+   * @param isUsePrefixes
+   *          if to compact using prefixes
+   * @param isUseInfixes
+   *          if to compact using infixes
+   * @throws Exception
+   *           if one of the Morfologik tools fails or the properties file
+   *           can not be written
+   */
+  public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
+      Charset encoding, String separator, boolean isUsePrefixes,
+      boolean isUseInfixes) throws Exception {
+
+    // we need to execute tab2morph followed by fsa_build
+
+    File morph = tab2morph(dictInFile, separator, isUsePrefixes, isUseInfixes);
+
+    try {
+      fsaBuild(morph, dictOutFile);
+    } finally {
+      // the intermediate file is not needed anymore, remove it even if
+      // fsa_build failed
+      morph.delete();
+    }
+
+    // now we create the properties files using the passed parameters
+    createProperties(encoding, separator, isUsePrefixes, isUseInfixes,
+        propertiesOutFile);
+  }
+
+  /**
+   * Writes the properties file that describes a dictionary.
+   *
+   * @param encoding
+   *          stored by its name under <code>fsa.dict.encoding</code>
+   * @param separator
+   *          stored under <code>fsa.dict.separator</code>
+   * @param isUsePrefixes
+   *          stored under <code>fsa.dict.uses-prefixes</code>
+   * @param isUseInfixes
+   *          stored under <code>fsa.dict.uses-infixes</code>
+   * @param propertiesFile
+   *          the file to write the properties to
+   * @throws FileNotFoundException
+   *           if the properties file can not be opened for writing
+   * @throws IOException
+   *           if writing the properties fails
+   */
+  void createProperties(Charset encoding, String separator,
+      boolean isUsePrefixes, boolean isUseInfixes, File propertiesFile)
+      throws FileNotFoundException, IOException {
+
+    Properties properties = new Properties();
+    properties.setProperty("fsa.dict.separator", separator);
+    properties.setProperty("fsa.dict.encoding", encoding.name());
+    properties.setProperty("fsa.dict.uses-prefixes",
+        Boolean.toString(isUsePrefixes));
+    properties.setProperty("fsa.dict.uses-infixes",
+        Boolean.toString(isUseInfixes));
+
+    OutputStream os = new FileOutputStream(propertiesFile);
+    try {
+      properties.store(os, "Morfologik POS Dictionary properties");
+    } finally {
+      // always release the file handle, also if store fails
+      os.close();
+    }
+
+  }
+
+  /**
+   * Runs {@link FSABuildTool} with the <code>cfsa2</code> format on the
+   * tab2morph output.
+   *
+   * @param morph
+   *          the intermediate file created by tab2morph
+   * @param dictOutFile
+   *          where to store the binary Morfologik dictionary
+   * @throws Exception
+   *           if the tool fails
+   */
+  private void fsaBuild(File morph, File dictOutFile) throws Exception {
+    String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
+        dictOutFile.getAbsolutePath() };
+    FSABuildTool.main(params);
+  }
+
+  /**
+   * Runs the <code>tab2morph</code> tool through the Morfologik
+   * {@link Launcher} and stores its output in a temporary file.
+   *
+   * @param dictInFile
+   *          the 3 column TSV dictionary file
+   * @param separator
+   *          passed to the tool as the <code>--annotation</code> value
+   * @param isUsePrefixes
+   *          if set <code>-pre</code> is passed to the tool
+   * @param isUseInfixes
+   *          if set <code>-inf</code> is passed to the tool
+   * @return the temporary file holding the tool output, it is registered to be
+   *         deleted on exit
+   * @throws Exception
+   *           if the temporary file can not be created or the tool fails
+   */
+  private File tab2morph(File dictInFile, String separator,
+      boolean isUsePrefixes, boolean isUseInfixes) throws Exception {
+
+    // create tab2morph parameters
+    List<String> tag2morphParams = new ArrayList<String>();
+    tag2morphParams.add("tab2morph");
+
+    tag2morphParams.add("--annotation");
+    tag2morphParams.add(separator);
+
+    if (isUsePrefixes) {
+      tag2morphParams.add("-pre");
+    }
+
+    if (isUseInfixes) {
+      tag2morphParams.add("-inf");
+    }
+
+    tag2morphParams.add("-i");
+    tag2morphParams.add(dictInFile.getAbsolutePath());
+
+    // we need a temporary file to store the intermediate output
+    File tmp = File.createTempFile("tab2morph", ".txt");
+    tmp.deleteOnExit();
+
+    tag2morphParams.add("-o");
+    tag2morphParams.add(tmp.getAbsolutePath());
+
+    Launcher.main(tag2morphParams.toArray(new String[tag2morphParams.size()]));
+
+    return tmp;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/CLI.java b/src/main/java/opennlp/morfologik/cmdline/CLI.java
new file mode 100644
index 0000000..66a5151
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/CLI.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline;
+
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool;
+import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineTool;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.TypedCmdLineTool;
+import opennlp.tools.util.Version;
+
+public final class CLI {
+
+	public static final String CMD = "opennlp-morfologik-addon";
+
+	/** Registered tools keyed by their name, in registration order. */
+	private static final Map<String, CmdLineTool> toolLookupMap;
+
+	static {
+		List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
+		tools.add(new MorfologikDictionaryBuilderTool());
+		tools.add(new XMLDictionaryToTableTool());
+
+		Map<String, CmdLineTool> lookup = new LinkedHashMap<String, CmdLineTool>();
+		for (CmdLineTool tool : tools) {
+			lookup.put(tool.getName(), tool);
+		}
+		toolLookupMap = Collections.unmodifiableMap(lookup);
+	}
+
+	/**
+	 * @return a set which contains all tool names
+	 */
+	public static Set<String> getToolNames() {
+		return toolLookupMap.keySet();
+	}
+
+	/** Prints the version, the general usage and the registered tools. */
+	private static void usage() {
+		System.out.print("OpenNLP Morfologik Addon "
+				+ Version.currentVersion().toString() + ". ");
+		System.out.println("Usage: " + CMD + " TOOL");
+		System.out.println("where TOOL is one of:");
+
+		// column at which the descriptions start: longest tool name plus 4
+		int numberOfSpaces = -1;
+		for (String toolName : toolLookupMap.keySet()) {
+			numberOfSpaces = Math.max(numberOfSpaces, toolName.length());
+		}
+		numberOfSpaces = numberOfSpaces + 4;
+
+		for (CmdLineTool tool : toolLookupMap.values()) {
+			System.out.print("  " + tool.getName());
+			// pad the name so that all descriptions are aligned
+			for (int i = tool.getName().length(); i < numberOfSpaces; i++) {
+				System.out.print(" ");
+			}
+			System.out.println(tool.getShortDescription());
+		}
+
+		System.out.println("All tools print help when invoked with help parameter");
+		// Build the example from a tool that is actually registered, a hard
+		// coded name goes stale as soon as the tool list changes.
+		System.out.println("Example: " + CMD + " "
+				+ toolLookupMap.keySet().iterator().next() + " help");
+	}
+
+	/**
+	 * Runs the tool named by the first argument with the remaining arguments;
+	 * the tool name may carry a format suffix separated by a dot. Terminates
+	 * the JVM via System.exit after usage or help output and on tool failure.
+	 */
+	public static void main(String[] args) {
+
+		if (args.length == 0) {
+			usage();
+			System.exit(0);
+		}
+
+		String[] toolArguments = new String[args.length - 1];
+		System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
+
+		String toolName = args[0];
+
+		// check for format
+		String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
+		int idx = toolName.indexOf(".");
+		if (-1 < idx) {
+			formatName = toolName.substring(idx + 1);
+			toolName = toolName.substring(0, idx);
+		}
+		CmdLineTool tool = toolLookupMap.get(toolName);
+
+		try {
+			if (null == tool) {
+				throw new TerminateToolException(1, "Tool " + toolName
+						+ " is not found.");
+			}
+
+			if ((0 == toolArguments.length && tool.hasParams())
+					|| 0 < toolArguments.length
+					&& "help".equals(toolArguments[0])) {
+				if (tool instanceof TypedCmdLineTool) {
+					System.out.println(((TypedCmdLineTool) tool)
+							.getHelp(formatName));
+				} else if (tool instanceof BasicCmdLineTool) {
+					System.out.println(tool.getHelp());
+				}
+
+				System.exit(0);
+			}
+
+			if (tool instanceof TypedCmdLineTool) {
+				((TypedCmdLineTool) tool).run(formatName, toolArguments);
+			} else if (tool instanceof BasicCmdLineTool) {
+				if (-1 == idx) {
+					((BasicCmdLineTool) tool).run(toolArguments);
+				} else {
+					throw new TerminateToolException(1, "Tool " + toolName
+							+ " does not support formats.");
+				}
+			} else {
+				throw new TerminateToolException(1, "Tool " + toolName
+						+ " is not supported.");
+			}
+		} catch (TerminateToolException e) {
+
+			if (e.getMessage() != null) {
+				System.err.println(e.getMessage());
+			}
+
+			if (e.getCause() != null) {
+				System.err.println(e.getCause().getMessage());
+				e.getCause().printStackTrace(System.err);
+			}
+
+			System.exit(e.getCode());
+		}
+	}
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
new file mode 100644
index 0000000..0b1e896
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the {@link MorfologikDictionaryBuilderTool}.
+ */
+interface MorfologikDictionaryBuilderParams extends EncodingParameter {
+
+  @ParameterDescription(valueName = "in", description = "Plain file with one entry per line")
+  File getInputFile();
+
+  @ParameterDescription(valueName = "out", description = "The generated dictionary file.")
+  File getOutputFile();
+
+  @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
+  @OptionalParameter(defaultValue = "+")
+  String getFSADictSeparator();
+
+  @ParameterDescription(valueName = "true|false", description = "Compact using prefixes.")
+  @OptionalParameter(defaultValue = "true")
+  Boolean getUsesPrefixes();
+
+  @ParameterDescription(valueName = "true|false", description = "Compact using infixes.")
+  @OptionalParameter(defaultValue = "true")
+  Boolean getUsesInfixes();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
new file mode 100644
index 0000000..9da7e7d
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+
+public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
+
+  interface Params extends MorfologikDictionaryBuilderParams {
+  }
+
+  public String getShortDescription() {
+    return "builds a binary POS Dictionary using Morfologik";
+  }
+
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /**
+   * Builds the dictionary and its properties file from the input file; any
+   * failure of the build is reported as a {@link TerminateToolException}.
+   */
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+
+    File dictInFile = params.getInputFile();
+    File dictOutFile = params.getOutputFile();
+    // Morfologik derives the properties file name from the dictionary path
+    File propertiesFile = new File(
+        Dictionary.getExpectedFeaturesName(dictOutFile.getAbsolutePath()));
+    Charset encoding = params.getEncoding();
+
+    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+    CmdLineUtil.checkOutputFile("properties output file", propertiesFile);
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    try {
+      builder.build(dictInFile, dictOutFile, propertiesFile, encoding,
+          params.getFSADictSeparator(), params.getUsesPrefixes(),
+          params.getUsesInfixes());
+    } catch (Exception e) {
+      throw new TerminateToolException(-1,
+          "Error while creating Morfologik POS Dictionary: " + e.getMessage(), e);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
new file mode 100644
index 0000000..b88cc5d
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the {@link XMLDictionaryToTableTool}.
+ */
+interface XMLDictionaryToTableParams extends EncodingParameter {
+
+  @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
+  File getInputFile();
+
+  @ParameterDescription(valueName = "out", description = "Tab separated format.")
+  File getOutputFile();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
new file mode 100644
index 0000000..c87f016
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.util.Iterator;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.postag.POSDictionary;
+
+public class XMLDictionaryToTableTool extends BasicCmdLineTool {
+
+  interface Params extends XMLDictionaryToTableParams {
+  }
+
+  public String getShortDescription() {
+    return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
+  }
+
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /**
+   * Writes one "word TAB TAB tag" line per word/tag pair; the lemma is empty.
+   */
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+
+    File dictInFile = params.getInputFile();
+    File dictOutFile = params.getOutputFile();
+    Charset encoding = params.getEncoding();
+
+    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+
+    POSDictionary tagDictionary;
+    // the input stream is closed as soon as the dictionary has been read
+    try (FileInputStream dictIn = new FileInputStream(dictInFile)) {
+      tagDictionary = POSDictionary.create(dictIn);
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "Error while loading XML POS Dictionary: " + e.getMessage(), e);
+    }
+    try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(),
+        encoding)) {
+      for (Iterator<String> it = tagDictionary.iterator(); it.hasNext();) {
+        String word = it.next();
+        String wordAndLemma = word + "\t\t"; // lemma is empty
+        for (String tag : tagDictionary.getTags(word)) {
+          writer.write(wordAndLemma + tag);
+          writer.newLine();
+        }
+      }
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing output: "
+          + e.getMessage(), e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
new file mode 100644
index 0000000..9b74ae5
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Map;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.ModelUtil;
+
+/** POS tagger factory whose tag dictionary is a Morfologik dictionary. */
+public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
+
+  private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
+  private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";
+
+  private static final String MORFOLOGIK_POSDICT = "tagdict."
+      + MORFOLOGIK_POSDICT_SUF;
+  private static final String MORFOLOGIK_DICT_INFO = "tagdict."
+      + MORFOLOGIK_DICT_INFO_SUF;
+
+  private TagDictionary dict;
+
+  private byte[] dictInfo;
+  private byte[] dictData;
+
+  public MorfologikPOSTaggerFactory() {
+  }
+
+  // NOTE(review): posDictionary is not handed to the super class - confirm
+  public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
+      TagDictionary posDictionary) {
+    super(ngramDictionary, null);
+  }
+
+  /**
+   * Loads the Morfologik dictionary named by the system property
+   * morfologik.dict together with its properties file.
+   */
+  @Override
+  protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
+    super.init(ngramDictionary, null);
+    this.dict = posDictionary;
+    // get the dictionary path
+    String path = System.getProperty("morfologik.dict");
+    if (path == null) {
+      throw new IllegalArgumentException(
+          "The property morfologik.dict is missing! -Dmorfologik.dict=path");
+    }
+
+    // now we try to load it...
+    try {
+      this.dictData = Files.readAllBytes(Paths.get(path));
+      this.dictInfo = Files.readAllBytes(Paths
+          .get(morfologik.stemming.Dictionary.getExpectedFeaturesName(path)));
+      this.dict = createMorfologikDictionary(dictData, dictInfo);
+    } catch (IllegalArgumentException e) {
+      throw new IllegalArgumentException(
+          "The file is not a Morfologik dictionary!", e);
+    } catch (IOException e) {
+      throw new IllegalArgumentException(
+          "Could not open the Morfologik dictionary or the .info file", e);
+    }
+  }
+
+  @Override
+  public TagDictionary getTagDictionary() {
+    if (this.dict == null) {
+      if (artifactProvider != null) {
+        Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
+        if (obj != null) {
+          // reuse the artifact that was fetched for the null check
+          byte[] data = (byte[]) obj;
+          byte[] info = (byte[]) artifactProvider
+              .getArtifact(MORFOLOGIK_DICT_INFO);
+          try {
+            this.dict = createMorfologikDictionary(data, info);
+          } catch (IllegalArgumentException e) {
+            throw new RuntimeException(
+                "Could not load the dictionary files to Morfologik.", e);
+          } catch (IOException e) {
+            throw new RuntimeException(
+                "IO error while reading the Morfologik dictionary files.", e);
+          }
+        }
+      }
+    }
+
+    return this.dict;
+  }
+
+  @Override
+  public void setTagDictionary(TagDictionary dictionary) {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  @Override
+  public TagDictionary createEmptyTagDictionary() {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  @Override
+  public TagDictionary createTagDictionary(File dictionary)
+      throws InvalidFormatException, FileNotFoundException, IOException {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  @Override
+  public TagDictionary createTagDictionary(InputStream in)
+      throws InvalidFormatException, IOException {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  @Override
+  @SuppressWarnings("rawtypes")
+  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
+    Map<String, ArtifactSerializer> serializers = super
+        .createArtifactSerializersMap();
+
+    serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
+    serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());
+
+    return serializers;
+  }
+
+  @Override
+  public Map<String, Object> createArtifactMap() {
+    Map<String, Object> artifactMap = super.createArtifactMap();
+    artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
+    artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
+    return artifactMap;
+  }
+
+  private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
+      throws IOException {
+    morfologik.stemming.Dictionary morfologikDict = morfologik.stemming.Dictionary
+        .readAndClose(new ByteArrayInputStream(data), new ByteArrayInputStream(
+            info));
+    return new MorfologikTagDictionary(morfologikDict);
+  }
+
+  static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
+
+    public byte[] create(InputStream in) throws IOException,
+        InvalidFormatException {
+      return ModelUtil.read(in);
+    }
+
+    public void serialize(byte[] artifact, OutputStream out) throws IOException {
+      out.write(artifact);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
new file mode 100644
index 0000000..b34ca2b
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.postag.TagDictionary;
+
+/**
+ * A POS Tagger dictionary implementation based on Morfologik binary
+ * dictionaries
+ */
+public class MorfologikTagDictionary implements TagDictionary {
+
+  private final IStemmer dictLookup;
+  private final boolean isCaseSensitive;
+
+  /**
+   * Creates a case sensitive {@link MorfologikTagDictionary}
+   *
+   * @param dict a Morfologik FSA dictionary
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException if the dictionary could not be read
+   */
+  public MorfologikTagDictionary(Dictionary dict)
+      throws IllegalArgumentException, IOException {
+    this(dict, true);
+  }
+
+  /**
+   * Creates a {@link MorfologikTagDictionary}
+   *
+   * @param dict a Morfologik FSA dictionary
+   * @param caseSensitive if true it performs case sensitive lookup
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException if the dictionary could not be read
+   */
+  public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
+      throws IllegalArgumentException, IOException {
+    this.dictLookup = new DictionaryLookup(dict);
+    this.isCaseSensitive = caseSensitive;
+  }
+
+  /**
+   * Looks up the tags of a word; the word is lower cased if case insensitive.
+   * @return the tags, or null if the dictionary holds no tag for the word
+   */
+  @Override
+  public String[] getTags(String word) {
+    if (!isCaseSensitive) {
+      word = word.toLowerCase();
+    }
+    List<WordData> data = dictLookup.lookup(word);
+    if (data == null || data.isEmpty()) {
+      return null;
+    }
+    List<String> tags = new ArrayList<String>(data.size());
+    for (WordData entry : data) {
+      // entries without tag information are skipped instead of causing a NPE
+      CharSequence tag = entry.getTag();
+      if (tag != null) {
+        tags.add(tag.toString());
+      }
+    }
+    return tags.isEmpty() ? null : tags.toArray(new String[tags.size()]);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
new file mode 100644
index 0000000..16d1dac
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Properties;
+
+import junit.framework.TestCase;
+import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
+
+import org.junit.Test;
+
+public class POSDictionayBuilderTest extends TestCase {
+
+  /** Builds a dictionary from the test resource and opens it for lookup. */
+  @Test
+  public void testBuildDictionary() throws Exception {
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").getFile());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // do not leave the generated dictionary behind in the temp directory
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
+        .toURL());
+
+    assertNotNull(ml);
+  }
+
+  /** Checks that the builder settings end up in the properties file. */
+  @Test
+  public void testPropertiesCreation() throws Exception {
+    Charset c = Charset.forName("iso-8859-1");
+    String sep = "_";
+
+    // {uses-prefixes, uses-infixes}
+    boolean[][] flagCombinations = { { true, true }, { false, true },
+        { true, false } };
+
+    for (boolean[] flags : flagCombinations) {
+      boolean pref = flags[0];
+      boolean inf = flags[1];
+      Properties p = createPropertiesHelper(c, sep, pref, inf);
+
+      assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
+      assertEquals(sep, p.getProperty("fsa.dict.separator"));
+      assertEquals(pref,
+          Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+      assertEquals(inf,
+          Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+    }
+  }
+
+  /**
+   * Lets the builder write a properties file and loads it back. The stream
+   * is closed and the temporary file removed even if loading fails.
+   */
+  private Properties createPropertiesHelper(Charset c, String sep,
+      boolean pref, boolean inf) throws IOException {
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
+        ".info");
+    try {
+      builder.createProperties(c, sep, pref, inf, f);
+
+      Properties prop = new Properties();
+      try (InputStream is = new FileInputStream(f)) {
+        prop.load(is);
+      }
+      return prop;
+    } finally {
+      f.delete();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
new file mode 100644
index 0000000..6fd6ec1
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -0,0 +1,46 @@
+package opennlp.morfologik.lemmatizer;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+import org.junit.Test;
+
+public class MorfologikLemmatizerTest {
+
+  @Test
+  public void testLemmatizeInsensitive() throws Exception {
+    DictionaryLemmatizer lemmatizer = createDictionary(false);
+
+    // the lemma of "casa" depends on the tag it is looked up with
+    assertEquals("casar", lemmatizer.lemmatize("casa", "V"));
+    assertEquals("casa", lemmatizer.lemmatize("casa", "NOUN"));
+
+    // an upper case query yields the lower case lemma
+    assertEquals("casa", lemmatizer.lemmatize("Casa", "PROP"));
+  }
+
+  /**
+   * Builds a dictionary from the test resource into a temporary file and
+   * opens a lemmatizer on it. NOTE(review): caseSensitive is not used here.
+   */
+  private MorfologikLemmatizer createDictionary(boolean caseSensitive)
+      throws Exception {
+
+    File source = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").getFile());
+    File target = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+
+    new MorfologikDictionayBuilder().build(source, target,
+        Charset.forName("UTF-8"), "+", true, true);
+
+    return new MorfologikLemmatizer(target.toURI().toURL());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
new file mode 100644
index 0000000..def97b6
--- /dev/null
+++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -0,0 +1,92 @@
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.morfologik.tagdict.MorfologikTagDictionary;
+import opennlp.tools.postag.TagDictionary;
+
+import org.junit.Test;
+
+public class MorfologikTagDictionaryTest {
+
+  @Test
+  public void testNoLemma() throws Exception {
+    MorfologikTagDictionary dict = createDictionary(false);
+
+    List<String> tags = Arrays.asList(dict.getTags("carro"));
+    assertEquals(1, tags.size());
+    assertTrue(tags.contains("NOUN"));
+
+  }
+
+  @Test
+  public void testPOSDictionaryInsensitive() throws Exception {
+    TagDictionary dict = createDictionary(false);
+
+    List<String> tags = Arrays.asList(dict.getTags("casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+    // this is the behavior of the case insensitive dictionary:
+    // a lookup of Casa is treated like casa, so it returns the
+    // NOUN and V tags instead of the proper noun entry
+    tags = Arrays.asList(dict.getTags("Casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+  }
+
+  @Test
+  public void testPOSDictionarySensitive() throws Exception {
+    TagDictionary dict = createDictionary(true);
+
+    List<String> tags = Arrays.asList(dict.getTags("casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+    // this is the behavior of the case sensitive dictionary:
+    // Casa only matches the entry stored with that exact casing,
+    // which is the proper noun, so the casa tags are not returned
+    tags = Arrays.asList(dict.getTags("Casa"));
+    assertEquals(1, tags.size());
+    assertTrue(tags.contains("PROP"));
+
+  }
+
+  private MorfologikTagDictionary createDictionary(boolean caseSensitive)
+      throws Exception {
+    return this.createDictionary(caseSensitive, null);
+  }
+
+  private MorfologikTagDictionary createDictionary(boolean caseSensitive,
+      List<String> constant) throws Exception {
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").getFile());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    MorfologikTagDictionary ml = new MorfologikTagDictionary(
+        Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
+
+    return ml;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/78dd579b/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt
new file mode 100644
index 0000000..5ac7111
--- /dev/null
+++ b/src/test/resources/dictionaryWithLemma.txt
@@ -0,0 +1,10 @@
+casa	casa	NOUN
+casa	casar	V
+Casa	Casa	PROP
+casinha	casa	NOUN
+casona	casa	NOUN
+menina	menino	NOUN
+menino	menino	NOUN
+meninão	menino	NOUN
+menininho	menino	NOUN
+carro		NOUN


[14/16] opennlp git commit: OPENNLP-622 Preparing to migrate morfologik-addon to main repository

Posted by co...@apache.org.
OPENNLP-622 Preparing to migrate morfologik-addon to main repository


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/772f31ff
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/772f31ff
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/772f31ff

Branch: refs/heads/trunk
Commit: 772f31ffe764afb675670735be556796781bda8d
Parents: 0cced84
Author: William Colen <wi...@gmail.com>
Authored: Wed Nov 9 18:23:28 2016 -0200
Committer: William Colen <wi...@gmail.com>
Committed: Wed Nov 9 18:23:28 2016 -0200

----------------------------------------------------------------------
 bin/morfologik-addon                            |  20 --
 bin/morfologik-addon.bat                        |  21 --
 opennlp-morfologik-addon/bin/morfologik-addon   |  20 ++
 .../bin/morfologik-addon.bat                    |  21 ++
 opennlp-morfologik-addon/pom.xml                | 109 +++++++++
 .../src/main/assembly/bin.xml                   |  91 ++++++++
 .../src/main/assembly/src.xml                   |  39 ++++
 .../src/main/bin/morfologik-addon               |  35 +++
 .../src/main/bin/morfologik-addon.bat           |  47 ++++
 .../src/main/bin/opennlp-cp                     |  35 +++
 .../builder/MorfologikDictionayBuilder.java     | 103 +++++++++
 .../java/opennlp/morfologik/cmdline/CLI.java    | 164 +++++++++++++
 .../MorfologikDictionaryBuilderParams.java      |  57 +++++
 .../MorfologikDictionaryBuilderTool.java        |  62 +++++
 .../builder/XMLDictionaryToTableParams.java     |  45 ++++
 .../builder/XMLDictionaryToTableTool.java       | 127 ++++++++++
 .../lemmatizer/MorfologikLemmatizer.java        |  96 ++++++++
 .../tagdict/MorfologikPOSTaggerFactory.java     | 170 ++++++++++++++
 .../tagdict/MorfologikTagDictionary.java        |  90 ++++++++
 .../opennlp/morfologik/util/MorfologikUtil.java |  36 +++
 .../src/main/readme/LICENSE                     | 230 +++++++++++++++++++
 .../src/main/readme/MORFOLOGIK-LICENSE          |  28 +++
 opennlp-morfologik-addon/src/main/readme/NOTICE |  11 +
 .../builder/POSDictionayBuilderTest.java        |  58 +++++
 .../lemmatizer/MorfologikLemmatizerTest.java    |  35 +++
 .../tagdict/MorfologikTagDictionaryTest.java    |  78 +++++++
 .../tagdict/POSTaggerFactoryTest.java           |  88 +++++++
 .../src/test/resources/AnnotatedSentences.txt   | 136 +++++++++++
 .../src/test/resources/dictionaryWithLemma.info |  15 ++
 .../src/test/resources/dictionaryWithLemma.txt  |  11 +
 pom.xml                                         | 109 ---------
 src/main/assembly/bin.xml                       |  91 --------
 src/main/assembly/src.xml                       |  39 ----
 src/main/bin/morfologik-addon                   |  35 ---
 src/main/bin/morfologik-addon.bat               |  47 ----
 src/main/bin/opennlp-cp                         |  35 ---
 .../builder/MorfologikDictionayBuilder.java     | 103 ---------
 .../java/opennlp/morfologik/cmdline/CLI.java    | 164 -------------
 .../MorfologikDictionaryBuilderParams.java      |  57 -----
 .../MorfologikDictionaryBuilderTool.java        |  62 -----
 .../builder/XMLDictionaryToTableParams.java     |  45 ----
 .../builder/XMLDictionaryToTableTool.java       | 127 ----------
 .../lemmatizer/MorfologikLemmatizer.java        |  96 --------
 .../tagdict/MorfologikPOSTaggerFactory.java     | 170 --------------
 .../tagdict/MorfologikTagDictionary.java        |  90 --------
 .../opennlp/morfologik/util/MorfologikUtil.java |  36 ---
 src/main/readme/LICENSE                         | 230 -------------------
 src/main/readme/MORFOLOGIK-LICENSE              |  28 ---
 src/main/readme/NOTICE                          |  11 -
 .../builder/POSDictionayBuilderTest.java        |  58 -----
 .../lemmatizer/MorfologikLemmatizerTest.java    |  35 ---
 .../tagdict/MorfologikTagDictionaryTest.java    |  78 -------
 .../tagdict/POSTaggerFactoryTest.java           |  88 -------
 src/test/resources/AnnotatedSentences.txt       | 136 -----------
 src/test/resources/dictionaryWithLemma.info     |  15 --
 src/test/resources/dictionaryWithLemma.txt      |  11 -
 56 files changed, 2037 insertions(+), 2037 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/bin/morfologik-addon b/bin/morfologik-addon
deleted file mode 100755
index ccc635e..0000000
--- a/bin/morfologik-addon
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh
-
-#   Licensed to the Apache Software Foundation (ASF) under one
-#   or more contributor license agreements.  See the NOTICE file
-#   distributed with this work for additional information
-#   regarding copyright ownership.  The ASF licenses this file
-#   to you under the Apache License, Version 2.0 (the
-#   "License"); you may not use this file except in compliance
-#   with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#   Unless required by applicable law or agreed to in writing,
-#   software distributed under the License is distributed on an
-#   #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#   KIND, either express or implied.  See the License for the
-#   specific language governing permissions and limitations
-#   under the License.
-
-mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=$*"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/bin/morfologik-addon.bat b/bin/morfologik-addon.bat
deleted file mode 100644
index 26a4778..0000000
--- a/bin/morfologik-addon.bat
+++ /dev/null
@@ -1,21 +0,0 @@
-@ECHO OFF
-
-REM #   Licensed to the Apache Software Foundation (ASF) under one
-REM #   or more contributor license agreements.  See the NOTICE file
-REM #   distributed with this work for additional information
-REM #   regarding copyright ownership.  The ASF licenses this file
-REM #   to you under the Apache License, Version 2.0 (the
-REM #   "License"); you may not use this file except in compliance
-REM #   with the License.  You may obtain a copy of the License at
-REM #
-REM #    http://www.apache.org/licenses/LICENSE-2.0
-REM #
-REM #   Unless required by applicable law or agreed to in writing,
-REM #   software distributed under the License is distributed on an
-REM #   
-REM #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-REM #   KIND, either express or implied.  See the License for the
-REM #   specific language governing permissions and limitations
-REM #   under the License.
-
-mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=%*"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/bin/morfologik-addon b/opennlp-morfologik-addon/bin/morfologik-addon
new file mode 100755
index 0000000..ccc635e
--- /dev/null
+++ b/opennlp-morfologik-addon/bin/morfologik-addon
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+#   Licensed to the Apache Software Foundation (ASF) under one
+#   or more contributor license agreements.  See the NOTICE file
+#   distributed with this work for additional information
+#   regarding copyright ownership.  The ASF licenses this file
+#   to you under the Apache License, Version 2.0 (the
+#   "License"); you may not use this file except in compliance
+#   with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing,
+#   software distributed under the License is distributed on an
+#   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied.  See the License for the
+#   specific language governing permissions and limitations
+#   under the License.
+
+mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=$*"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/bin/morfologik-addon.bat b/opennlp-morfologik-addon/bin/morfologik-addon.bat
new file mode 100644
index 0000000..26a4778
--- /dev/null
+++ b/opennlp-morfologik-addon/bin/morfologik-addon.bat
@@ -0,0 +1,21 @@
+@ECHO OFF
+
+REM #   Licensed to the Apache Software Foundation (ASF) under one
+REM #   or more contributor license agreements.  See the NOTICE file
+REM #   distributed with this work for additional information
+REM #   regarding copyright ownership.  The ASF licenses this file
+REM #   to you under the Apache License, Version 2.0 (the
+REM #   "License"); you may not use this file except in compliance
+REM #   with the License.  You may obtain a copy of the License at
+REM #
+REM #    http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM #   Unless required by applicable law or agreed to in writing,
+REM #   software distributed under the License is distributed on an
+REM #   
+REM #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM #   KIND, either express or implied.  See the License for the
+REM #   specific language governing permissions and limitations
+REM #   under the License.
+
+mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=%*"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
new file mode 100644
index 0000000..56d0e47
--- /dev/null
+++ b/opennlp-morfologik-addon/pom.xml
@@ -0,0 +1,109 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+
+	<groupId>org.apache.opennlp</groupId>
+	<artifactId>morfologik-addon</artifactId>
+	<version>1.0-SNAPSHOT</version>
+	<packaging>jar</packaging>
+	<name>Morfologik Addon</name>
+
+	<url>http://maven.apache.org</url>
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<version>2.3.2</version>
+				<configuration>
+					<source>1.7</source>
+					<target>1.7</target>
+				</configuration>
+			</plugin>
+			<plugin>
+				<artifactId>maven-assembly-plugin</artifactId>
+				<executions>
+					<execution>
+						<id>bundle-project-sources</id>
+						<phase>package</phase>
+						<goals>
+							<goal>single</goal>
+						</goals>
+						<configuration>
+							<descriptors>
+								<descriptor>src/main/assembly/bin.xml</descriptor>
+								<descriptor>src/main/assembly/src.xml</descriptor>
+							</descriptors>
+							<!-- Tar package is only compatible with gnu tar,
+							     many file have more than 100 chars.
+							     Right now only javadoc files are too long.
+							 -->
+							 <tarLongFileMode>gnu</tarLongFileMode>
+							 
+							 <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName>
+						</configuration>
+					</execution>
+				</executions>
+			</plugin>
+			<plugin> 
+	        <artifactId>maven-antrun-plugin</artifactId> 
+	        <version>1.6</version> 
+	        <executions> 
+	          <execution> 
+	            <id>generate checksums for binary artifacts</id> 
+	            <goals><goal>run</goal></goals> 
+	            <phase>verify</phase> 
+	            <configuration> 
+	              <target> 
+	                <checksum algorithm="sha1" format="MD5SUM"> 
+	                  <fileset dir="${project.build.directory}"> 
+	                    <include name="*.zip" /> 
+	                    <include name="*.gz" /> 
+	                  </fileset> 
+	                </checksum> 
+	                <checksum algorithm="md5" format="MD5SUM"> 
+	                  <fileset dir="${project.build.directory}"> 
+	                    <include name="*.zip" /> 
+	                    <include name="*.gz" /> 
+	                  </fileset> 
+	                </checksum> 
+	              </target> 
+	            </configuration> 
+	          </execution> 
+	        </executions> 
+	      </plugin>
+		</plugins>
+	</build>
+	<properties>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+	</properties>
+
+	<dependencies>
+		<dependency>
+			<groupId>org.carrot2</groupId>
+			<artifactId>morfologik-stemming</artifactId>
+			<version>2.1.0</version>
+			<scope>compile</scope>
+		</dependency>
+		<dependency>
+			<groupId>org.carrot2</groupId>
+			<artifactId>morfologik-tools</artifactId>
+			<version>2.1.0</version>
+			<scope>compile</scope>
+		</dependency>
+
+		<dependency>
+			<groupId>org.apache.opennlp</groupId>
+			<artifactId>opennlp-tools</artifactId>
+			<version>1.6.0</version>
+		</dependency>
+
+		<dependency>
+			<groupId>junit</groupId>
+			<artifactId>junit</artifactId>
+			<version>4.8.1</version>
+			<scope>test</scope>
+		</dependency>
+
+	</dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/assembly/bin.xml b/opennlp-morfologik-addon/src/main/assembly/bin.xml
new file mode 100644
index 0000000..ab4f6da
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/assembly/bin.xml
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<assembly>
+  <id>bin</id>
+  <formats>
+    <format>tar.gz</format>
+    <format>zip</format>
+    <format>dir</format>
+  </formats>
+  
+    <includeBaseDirectory>true</includeBaseDirectory>
+	<baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
+  
+	<dependencySets>
+		<dependencySet>
+			<scope>runtime</scope>
+			<unpack>false</unpack>
+			<useProjectArtifact>false</useProjectArtifact>
+			<fileMode>644</fileMode>
+			<directoryMode>755</directoryMode>
+			<outputDirectory>lib</outputDirectory>
+			<useTransitiveDependencies>true</useTransitiveDependencies>
+		</dependencySet>
+	</dependencySets>
+	
+	<fileSets>
+	    <fileSet>
+	    	<directory>src/main/readme</directory>
+	    	<outputDirectory></outputDirectory>
+	    	<fileMode>644</fileMode>
+	    	<directoryMode>755</directoryMode>      
+	    </fileSet>
+		
+	    <fileSet>
+	      <directory>.</directory>
+	      <outputDirectory></outputDirectory>
+	      <filtered>true</filtered>
+	      <fileMode>644</fileMode>
+	      <directoryMode>755</directoryMode> 
+	      <includes>
+	        <include>README</include>
+	        <include>RELEASE_NOTES.html</include>
+	      </includes>       
+	    </fileSet>
+	    
+	    <fileSet>
+	      <directory>target</directory>
+	      <outputDirectory></outputDirectory>
+	      <fileMode>644</fileMode>
+	      <directoryMode>755</directoryMode> 
+	      <includes>
+	        <include>issuesFixed/**</include>      
+	      </includes>       
+	    </fileSet>
+	    
+		<fileSet>
+			<directory>src/main/bin</directory>
+			<fileMode>755</fileMode>
+			<directoryMode>755</directoryMode>
+			<outputDirectory>bin</outputDirectory>
+		</fileSet>
+		
+		  <fileSet>
+		    <directory>target</directory>
+		    <outputDirectory>lib</outputDirectory>
+		    <includes>
+		      <include>morfologik-addon-*.jar</include>
+		    </includes>
+		  </fileSet>
+		
+	</fileSets>
+</assembly>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/assembly/src.xml b/opennlp-morfologik-addon/src/main/assembly/src.xml
new file mode 100644
index 0000000..cdcc9d3
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/assembly/src.xml
@@ -0,0 +1,39 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<assembly>
+  <id>src</id>
+  <formats>
+    <format>tar.gz</format>
+    <format>zip</format>
+  </formats>
+  
+  <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory>
+  
+  <fileSets>
+    <fileSet>
+      <directory>../</directory>
+      <outputDirectory></outputDirectory>
+      <excludes>
+        <exclude>**/target/**</exclude>
+        <exclude>**/.*/**</exclude>
+        <exclude>**/pom.xml.releaseBackup</exclude>
+        <exclude>**/release.properties</exclude>
+      </excludes>
+    </fileSet>
+  </fileSets>
+</assembly>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/bin/morfologik-addon b/opennlp-morfologik-addon/src/main/bin/morfologik-addon
new file mode 100755
index 0000000..9b0faf9
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/bin/morfologik-addon
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+#   Licensed to the Apache Software Foundation (ASF) under one
+#   or more contributor license agreements.  See the NOTICE file
+#   distributed with this work for additional information
+#   regarding copyright ownership.  The ASF licenses this file
+#   to you under the Apache License, Version 2.0 (the
+#   "License"); you may not use this file except in compliance
+#   with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing,
+#   software distributed under the License is distributed on an
+#   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied.  See the License for the
+#   specific language governing permissions and limitations
+#   under the License.
+
+# Note:  Do not output anything in this script file, any output
+#        may be inadvertently placed in any output files if
+#        output redirection is used.
+
+if [ -z "$JAVACMD" ] ; then
+  if [ -n "$JAVA_HOME"  ] ; then
+    JAVACMD="$JAVA_HOME/bin/java"
+  else
+    JAVACMD="`which java`"
+  fi
+fi
+
+# Might fail if $0 is a link
+OPENNLP_HOME=`dirname "$0"`/..
+# Resolve lib against OPENNLP_HOME so the script works from any directory; quote to keep arguments with spaces intact
+"$JAVACMD" -Xmx1024m -cp "$OPENNLP_HOME/lib/*" opennlp.morfologik.cmdline.CLI "$@"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat b/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat
new file mode 100644
index 0000000..aeec31f
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/bin/morfologik-addon.bat
@@ -0,0 +1,47 @@
+@ECHO off
+
+REM #   Licensed to the Apache Software Foundation (ASF) under one
+REM #   or more contributor license agreements.  See the NOTICE file
+REM #   distributed with this work for additional information
+REM #   regarding copyright ownership.  The ASF licenses this file
+REM #   to you under the Apache License, Version 2.0 (the
+REM #   "License"); you may not use this file except in compliance
+REM #   with the License.  You may obtain a copy of the License at
+REM #
+REM #    http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM #   Unless required by applicable law or agreed to in writing,
+REM #   software distributed under the License is distributed on an
+REM #   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM #   KIND, either express or implied.  See the License for the
+REM #   specific language governing permissions and limitations
+REM #   under the License.
+
+REM # Note:  Do not output anything in this script file, any output
+REM #        may be inadvertently placed in any output files if
+REM #        output redirection is used.
+SETLOCAL
+
+IF "%JAVA_CMD%" == "" (
+	IF "%JAVA_HOME%" == "" (
+		SET JAVA_CMD=java 
+	) ELSE (
+		REM # Keep JAVA_HOME to short-name without spaces
+		FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java
+	)
+)
+
+REM #  Should work with Windows XP and greater.  If not, specify the path to where it is installed.
+IF "%OPENNLP_HOME%" == "" (
+	SET OPENNLP_HOME=%~sp0..
+) ELSE (
+	REM # Keep OPENNLP_HOME to short-name without spaces
+	FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA
+)
+
+REM #  Put every JAR in lib on the classpath, as the sh launcher does; -jar would need a Main-Class manifest entry
+SET LIB_CLASSPATH=%OPENNLP_HOME%\lib\*
+
+%JAVA_CMD% -Xmx1024m -cp "%LIB_CLASSPATH%" opennlp.morfologik.cmdline.CLI %*
+
+ENDLOCAL
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/bin/opennlp-cp
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/bin/opennlp-cp b/opennlp-morfologik-addon/src/main/bin/opennlp-cp
new file mode 100755
index 0000000..dff0d12
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/bin/opennlp-cp
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+#   Licensed to the Apache Software Foundation (ASF) under one
+#   or more contributor license agreements.  See the NOTICE file
+#   distributed with this work for additional information
+#   regarding copyright ownership.  The ASF licenses this file
+#   to you under the Apache License, Version 2.0 (the
+#   "License"); you may not use this file except in compliance
+#   with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing,
+#   software distributed under the License is distributed on an
+#   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied.  See the License for the
+#   specific language governing permissions and limitations
+#   under the License.
+
+# Note:  Do not output anything in this script file, any output
+#        may be inadvertently placed in any output files if
+#        output redirection is used.
+
+if [ -z "$JAVACMD" ] ; then
+  if [ -n "$JAVA_HOME"  ] ; then
+    JAVACMD="$JAVA_HOME/bin/java"
+  else
+    JAVACMD="`which java`"
+  fi
+fi
+
+# Might fail if $0 is a link
+OPENNLP_HOME=`dirname "$0"`/..
+# Resolve lib against OPENNLP_HOME so the script works from any directory; quote to keep arguments with spaces intact
+"$JAVACMD" -Xmx1024m -cp "$OPENNLP_HOME/lib/*" opennlp.tools.cmdline.CLI "$@"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
new file mode 100644
index 0000000..dbbca4d
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Path;
+import java.util.Properties;
+
+import morfologik.stemming.DictionaryMetadata;
+import morfologik.stemming.EncoderType;
+import morfologik.tools.DictCompile;
+
+/**
+ * Utility class to build Morfologik dictionaries from a tab separated values
+ * file. The first column is the word, the second its lemma and the third a POS
+ * tag. If there is no lemma information leave the second column empty.
+ */
+public class MorfologikDictionayBuilder {
+
+  /**
+   * Helper to compile a morphological dictionary automaton.
+   * 
+   * @param input
+   *          The input file (base,inflected,tag). An associated metadata
+   *          (*.info) file must exist.
+   * @param overwrite
+   *          Overwrite the output file if it exists.
+   * @param validate
+   *          Validate input to make sure it makes sense.
+   * @param acceptBom
+   *          Accept leading BOM bytes (UTF-8).
+   * @param acceptCr
+   *          Accept CR bytes in input sequences (\r).
+   * @param ignoreEmpty
+   *          Ignore empty lines in the input.
+   * @return the dictionary path
+   * 
+   * @throws Exception
+   */
+  public Path build(Path input, boolean overwrite, boolean validate,
+      boolean acceptBom, boolean acceptCr, boolean ignoreEmpty)
+      throws Exception {
+
+    DictCompile compiler = new DictCompile(input, overwrite, validate,
+        acceptBom, acceptCr, ignoreEmpty);
+    compiler.call();
+
+    
+    Path metadataPath = DictionaryMetadata
+        .getExpectedMetadataLocation(input);
+    
+    return metadataPath.resolveSibling(
+        metadataPath.getFileName().toString().replaceAll(
+            "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict"));
+  }
+
+  /**
+   * Helper to compile a morphological dictionary automaton using default
+   * parameters.
+   * 
+   * @param input
+   *          The input file (base,inflected,tag). An associated metadata
+   *          (*.info) file must exist.
+   *          
+   *  @return the dictionary path
+   * 
+   * @throws Exception
+   */
+  public Path build(Path input) throws Exception {
+
+    return build(input, true, true, false, false, false);
+
+  }
+
+  Properties createProperties(Charset encoding, String separator,
+      EncoderType encoderType) throws FileNotFoundException, IOException {
+
+    Properties properties = new Properties();
+    properties.setProperty("fsa.dict.separator", separator);
+    properties.setProperty("fsa.dict.encoding", encoding.name());
+    properties.setProperty("fsa.dict.encoder", encoderType.name());
+
+    return properties;
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
new file mode 100644
index 0000000..f92d178
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline;
+
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool;
+import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineTool;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.TypedCmdLineTool;
+import opennlp.tools.util.Version;
+
+/**
+ * Command line entry point of the OpenNLP Morfologik addon. It dispatches the
+ * first argument, a tool name optionally followed by ".format", to one of the
+ * registered {@link CmdLineTool}s.
+ */
+public final class CLI {
+
+  public static final String CMD = "opennlp-morfologik-addon";
+
+  /** Registered tools keyed by name, in registration order; unmodifiable. */
+  private static final Map<String, CmdLineTool> toolLookupMap;
+
+  static {
+    List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
+    tools.add(new MorfologikDictionaryBuilderTool());
+    tools.add(new XMLDictionaryToTableTool());
+
+    Map<String, CmdLineTool> lookup = new LinkedHashMap<String, CmdLineTool>();
+    for (CmdLineTool tool : tools) {
+      lookup.put(tool.getName(), tool);
+    }
+    toolLookupMap = Collections.unmodifiableMap(lookup);
+  }
+
+  /**
+   * @return a set which contains all tool names
+   */
+  public static Set<String> getToolNames() {
+    return toolLookupMap.keySet();
+  }
+
+  /**
+   * Prints the addon version, the list of tools with their short
+   * descriptions, and an example invocation to standard output.
+   */
+  private static void usage() {
+    System.out.print("OpenNLP Morfologik Addon "
+        + Version.currentVersion().toString() + ". ");
+    System.out.println("Usage: " + CMD + " TOOL");
+    System.out.println("where TOOL is one of:");
+
+    // width of the name column: longest tool name plus four spaces
+    int columnWidth = 0;
+    for (String toolName : toolLookupMap.keySet()) {
+      columnWidth = Math.max(columnWidth, toolName.length());
+    }
+    columnWidth += 4;
+
+    for (CmdLineTool tool : toolLookupMap.values()) {
+      StringBuilder line = new StringBuilder("  ").append(tool.getName());
+      for (int i = tool.getName().length(); i < columnWidth; i++) {
+        line.append(' ');
+      }
+      System.out.println(line.append(tool.getShortDescription()));
+    }
+
+    System.out.println("All tools print help when invoked with help parameter");
+    // name a tool that is actually registered so the example can be run as is
+    System.out.println("Example: " + CMD + " "
+        + toolLookupMap.keySet().iterator().next() + " help");
+  }
+
+  /**
+   * Runs the tool named by the first argument with the remaining arguments.
+   * Prints the usage text and exits with code 0 when no argument is given;
+   * prints the tool help and exits with code 0 when the tool has parameters
+   * but got none, or when its first argument is "help". A
+   * {@link TerminateToolException} is reported on standard error and its
+   * code becomes the exit code.
+   *
+   * @param args
+   *          tool name (optionally "name.format") followed by tool arguments
+   */
+  @SuppressWarnings("rawtypes")
+  public static void main(String[] args) {
+    if (args.length == 0) {
+      usage();
+      System.exit(0);
+    }
+
+    String[] toolArguments = new String[args.length - 1];
+    System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
+    String toolName = args[0];
+
+    // an optional stream format is appended to the tool name after a dot
+    String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
+    int idx = toolName.indexOf(".");
+    if (-1 < idx) {
+      formatName = toolName.substring(idx + 1);
+      toolName = toolName.substring(0, idx);
+    }
+    CmdLineTool tool = toolLookupMap.get(toolName);
+    try {
+      if (null == tool) {
+        throw new TerminateToolException(1, "Tool " + toolName + " is not found.");
+      }
+      if ((0 == toolArguments.length && tool.hasParams()) ||
+          0 < toolArguments.length && "help".equals(toolArguments[0])) {
+        if (tool instanceof TypedCmdLineTool) {
+          System.out.println(((TypedCmdLineTool) tool).getHelp(formatName));
+        } else if (tool instanceof BasicCmdLineTool) {
+          System.out.println(tool.getHelp());
+        }
+        System.exit(0);
+      }
+
+      if (tool instanceof TypedCmdLineTool) {
+        ((TypedCmdLineTool) tool).run(formatName, toolArguments);
+      } else if (tool instanceof BasicCmdLineTool) {
+        if (-1 == idx) {
+          ((BasicCmdLineTool) tool).run(toolArguments);
+        } else {
+          throw new TerminateToolException(1, "Tool " + toolName + " does not support formats.");
+        }
+      } else {
+        throw new TerminateToolException(1, "Tool " + toolName + " is not supported.");
+      }
+    } catch (TerminateToolException e) {
+      if (e.getMessage() != null) {
+        System.err.println(e.getMessage());
+      }
+      if (e.getCause() != null) {
+        System.err.println(e.getCause().getMessage());
+        e.getCause().printStackTrace(System.err);
+      }
+      System.exit(e.getCode());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
new file mode 100644
index 0000000..5ea2e4f
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/** Command line parameters of the Morfologik dictionary builder tool. */
+interface MorfologikDictionaryBuilderParams extends EncodingParameter {
+
+  @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.")
+  File getInputFile();
+
+  @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).")
+  @OptionalParameter(defaultValue="false")
+  Boolean getAcceptBOM();
+
+  // the backslash is escaped so the help shows "\r" literally, not a carriage return
+  @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\\r).")
+  @OptionalParameter(defaultValue="false")
+  Boolean getAcceptCR();
+
+  // NOTE(review): MorfologikDictionaryBuilderTool#run does not read this value.
+  @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.")
+  @OptionalParameter(defaultValue="FSA5")
+  String getFormat();
+
+  @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getIgnoreEmpty();
+
+  @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getOverwrite();
+
+  @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.")
+  @OptionalParameter(defaultValue="false")
+  Boolean getValidate();
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
new file mode 100644
index 0000000..eb9b51c
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+import java.nio.file.Path;
+
+import morfologik.stemming.DictionaryMetadata;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+
+/** Command line tool that compiles a dictionary table with Morfologik. */
+public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
+
+  interface Params extends MorfologikDictionaryBuilderParams {}
+
+  @Override
+  public String getShortDescription() {
+    return "builds a binary POS Dictionary using Morfologik";
+  }
+
+  @Override
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /** Checks the input and its .info metadata file, then runs the build. */
+  @Override
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+    File dictInFile = params.getInputFile();
+    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+    Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath());
+    CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile());
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    try {
+      builder.build(dictInFile.toPath(), params.getOverwrite(),
+          params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(),
+          params.getIgnoreEmpty());
+    } catch (Exception e) {
+      // any build failure ends the tool with exit code -1 and keeps the cause
+      throw new TerminateToolException(-1,
+          "Error while creating Morfologik POS Dictionary: " + e.getMessage(), e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
new file mode 100644
index 0000000..4ee8cd4
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the XML dictionary to table conversion tool.
+ */
+interface XMLDictionaryToTableParams extends EncodingParameter {
+
+  @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
+  File getInputFile();
+
+  @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will also be created).")
+  File getOutputFile();
+
+  @ParameterDescription(valueName = "char", description = "Column separator (must be a single character)")
+  @OptionalParameter(defaultValue=",")
+  String getSeparator();
+
+  @ParameterDescription(valueName = "value", description = "Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].")
+  @OptionalParameter(defaultValue="prefix")
+  String getEncoder();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
new file mode 100644
index 0000000..0e7f2d5
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Iterator;
+import java.util.Properties;
+
+import morfologik.stemming.DictionaryMetadata;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.postag.POSDictionary;
+
+/**
+ * Command line tool which converts an OpenNLP XML tag dictionary into the
+ * separator-delimited table used by Morfologik plus its metadata (.info) file.
+ */
+public class XMLDictionaryToTableTool extends BasicCmdLineTool {
+
+  interface Params extends XMLDictionaryToTableParams {
+  }
+
+  /** Column separator of the current run; assigned at the start of run. */
+  private String separator;
+
+  @Override
+  public String getShortDescription() {
+    return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
+  }
+
+  @Override
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /**
+   * Reads the XML tag dictionary, writes one row per valid (word, tag) pair and
+   * stores the metadata (.info) file; failures raise TerminateToolException.
+   */
+  @Override
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+    File dictInFile = params.getInputFile();
+    File dictOutFile = params.getOutputFile();
+    Charset encoding = params.getEncoding();
+    separator = params.getSeparator();
+    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+
+    POSDictionary tagDictionary;
+    // close the input stream once the dictionary is loaded
+    try (FileInputStream in = new FileInputStream(dictInFile)) {
+      tagDictionary = POSDictionary.create(in);
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "Error while loading XML POS Dictionary: " + e.getMessage(), e);
+    }
+
+    Iterator<String> iterator = tagDictionary.iterator();
+    try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(), encoding)) {
+      while (iterator.hasNext()) {
+        String word = iterator.next();
+        for (String tag : tagDictionary.getTags(word)) {
+          if (valid(word, tag)) {
+            writer.write(createEntry(word, tag));
+            writer.newLine();
+          }
+        }
+      }
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing output: "
+          + e.getMessage(), e);
+    }
+    System.out.println("Created dictionary: " + dictOutFile.toPath());
+
+    Properties info = new Properties();
+    info.setProperty("fsa.dict.separator", separator);
+    info.setProperty("fsa.dict.encoding", encoding.name());
+    info.setProperty("fsa.dict.encoder", params.getEncoder());
+    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath());
+    // Properties.store flushes the stream but leaves closing to the caller
+    try (java.io.OutputStream out = Files.newOutputStream(metaPath)) {
+      info.store(out, "Info file for FSA Morfologik dictionary.");
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing metadata output: "
+          + e.getMessage(), e);
+    }
+    System.out.println("Created metadata: " + metaPath);
+  }
+
+  /** Rejects, with a warning on standard output, entries containing the separator. */
+  private boolean valid(String word, String tag) {
+    if (word.contains(separator) || tag.contains(separator)) {
+      System.out.println("Warn: invalid entry because contains separator - word: "
+          + word + " tag: " + tag);
+      return false;
+    }
+    return true;
+  }
+
+  /** Builds a table row with an empty base column: separator, word, separator, tag. */
+  private String createEntry(String word, String tag) {
+    return "" + separator + word + separator + tag;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
new file mode 100644
index 0000000..2090ce5
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.lemmatizer;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+/**
+ * Dictionary lemmatizer backed by a Morfologik dictionary. Words are looked
+ * up lower-cased unless their POS tag is one of {@link #constantTags}.
+ */
+public class MorfologikLemmatizer implements DictionaryLemmatizer {
+
+  private final IStemmer dictLookup;
+
+  /** POS tags whose words are looked up, and fall back, with their case kept. */
+  public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
+      "NNP", "NP00000"));
+
+  /**
+   * Creates a lemmatizer on top of the dictionary read from the given path.
+   *
+   * @param dictionaryPath
+   *          path of the Morfologik dictionary to read
+   * @throws IOException
+   *           if reading the dictionary fails
+   */
+  public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
+      IOException {
+    dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
+  }
+
+  /**
+   * Returns the stem of the last entry of {@code form} whose tag equals
+   * {@code postag}, or {@code null}; entries lacking tag or stem are skipped.
+   */
+  private String findLemma(String form, String postag) {
+    String lemma = null;
+    for (WordData wd : dictLookup.lookup(form)) {
+      CharSequence tag = wd.getTag();
+      CharSequence stem = wd.getStem();
+      if (tag != null && stem != null && tag.toString().equals(postag)) {
+        lemma = stem.toString();
+      }
+    }
+    return lemma;
+  }
+
+  /**
+   * Returns the dictionary lemma of {@code word} for {@code postag}. Without
+   * a dictionary entry, the word itself is returned if the tag is a constant
+   * tag or upper-casing leaves the word unchanged, else the lower-cased word.
+   * @param word the surface form to lemmatize
+   * @param postag the POS tag of the word
+   * @return the lemma
+   */
+  public String lemmatize(String word, String postag) {
+    boolean keepCase = constantTags.contains(postag);
+    String lemma = findLemma(keepCase ? word : word.toLowerCase(), postag);
+    if (lemma != null) {
+      return lemma;
+    }
+    // equals(), not ==: the identity of toUpperCase()'s result is unspecified
+    if (keepCase || word.toUpperCase().equals(word)) {
+      return word;
+    }
+    return word.toLowerCase();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
new file mode 100644
index 0000000..93d6c61
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Map;
+
+import morfologik.stemming.DictionaryMetadata;
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * A {@link POSTaggerFactory} whose tag dictionary is a Morfologik dictionary.
+ * The raw dictionary and metadata bytes are kept so that they can be stored
+ * as model artifacts and turned back into a dictionary later.
+ */
+public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
+
+  private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
+  private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";
+
+  private static final String MORFOLOGIK_POSDICT = "tagdict."
+      + MORFOLOGIK_POSDICT_SUF;
+  private static final String MORFOLOGIK_DICT_INFO = "tagdict."
+      + MORFOLOGIK_DICT_INFO_SUF;
+
+  private TagDictionary dict;
+
+  private byte[] dictInfo;
+  private byte[] dictData;
+
+  public MorfologikPOSTaggerFactory() {
+  }
+
+  /**
+   * Loads a dictionary file and its metadata file, keeping their bytes for
+   * createArtifactMap; throws FileNotFoundException if either is unreadable.
+   */
+  public TagDictionary createTagDictionary(File dictionary)
+      throws InvalidFormatException, FileNotFoundException, IOException {
+    if (!dictionary.canRead()) {
+      throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath());
+    }
+
+    Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath());
+
+    if (dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) {
+      // dictionaryMeta may be null here, so it must not be dereferenced blindly
+      String metaName = dictionaryMeta == null
+          ? "(no metadata location for " + dictionary.getName() + ")"
+          : String.valueOf(dictionaryMeta.getFileName());
+      throw new FileNotFoundException("Could not read dictionary metadata: " + metaName);
+    }
+
+    this.dictData = Files.readAllBytes(dictionary.toPath());
+    this.dictInfo = Files.readAllBytes(dictionaryMeta);
+
+    return createMorfologikDictionary(dictData, dictInfo);
+  }
+
+  /** Stores the POS dictionary locally; the superclass is given none. */
+  @Override
+  protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
+    super.init(ngramDictionary, null);
+    this.dict = posDictionary;
+  }
+
+  /** Returns the tag dictionary, building it from the provider's artifacts on first use. */
+  @Override
+  public TagDictionary getTagDictionary() {
+    if (this.dict != null || artifactProvider == null) {
+      return this.dict;
+    }
+
+    // look the data artifact up once instead of once per use
+    Object data = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
+    if (data != null) {
+      Object info = artifactProvider.getArtifact(MORFOLOGIK_DICT_INFO);
+      try {
+        this.dict = createMorfologikDictionary((byte[]) data, (byte[]) info);
+      } catch (IllegalArgumentException e) {
+        throw new RuntimeException(
+            "Could not load the dictionary files to Morfologik.", e);
+      } catch (IOException e) {
+        throw new RuntimeException(
+            "IO error while reading the Morfologik dictionary files.", e);
+      }
+    }
+    return this.dict;
+  }
+
+  @Override
+  public void setTagDictionary(TagDictionary dictionary) {
+    this.dict = dictionary;
+  }
+
+  @Override
+  public TagDictionary createEmptyTagDictionary() {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  @Override
+  public TagDictionary createTagDictionary(InputStream in)
+      throws InvalidFormatException, IOException {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  @Override
+  @SuppressWarnings("rawtypes")
+  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
+    Map<String, ArtifactSerializer> serializers = super.createArtifactSerializersMap();
+    serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
+    serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());
+    return serializers;
+  }
+
+  @Override
+  public Map<String, Object> createArtifactMap() {
+    Map<String, Object> artifactMap = super.createArtifactMap();
+    artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
+    artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
+    return artifactMap;
+  }
+
+  private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
+      throws IOException {
+    morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary.read(
+        new ByteArrayInputStream(data), new ByteArrayInputStream(info));
+    return new MorfologikTagDictionary(dict);
+  }
+
+  /** Serializes a model artifact as the raw bytes it consists of. */
+  static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
+    public byte[] create(InputStream in) throws IOException, InvalidFormatException {
+      return ModelUtil.read(in);
+    }
+
+    public void serialize(byte[] artifact, OutputStream out) throws IOException {
+      out.write(artifact);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
new file mode 100644
index 0000000..b34ca2b
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.postag.TagDictionary;
+
+/**
+ * A POS Tagger dictionary implementation based on Morfologik binary
+ * dictionaries.
+ */
+public class MorfologikTagDictionary implements TagDictionary {
+
+  private final IStemmer dictLookup;
+  private final boolean isCaseSensitive;
+
+  /**
+   * Creates a case sensitive {@link MorfologikTagDictionary}.
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared in the signature; see the two-argument constructor
+   */
+  public MorfologikTagDictionary(Dictionary dict)
+      throws IllegalArgumentException, IOException {
+    this(dict, true);
+  }
+
+  /**
+   * Creates a {@link MorfologikTagDictionary}.
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @param caseSensitive
+   *          if true it performs case sensitive lookup
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared in the signature; TODO confirm DictionaryLookup raises it
+   */
+  public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
+      throws IllegalArgumentException, IOException {
+    this.dictLookup = new DictionaryLookup(dict);
+    this.isCaseSensitive = caseSensitive;
+  }
+
+  /** Returns the tags found for the word (lower-cased if not case sensitive), or null if none. */
+  @Override
+  public String[] getTags(String word) {
+    String key = isCaseSensitive ? word : word.toLowerCase();
+    List<WordData> data = dictLookup.lookup(key);
+    if (data == null || data.isEmpty()) {
+      return null;
+    }
+    List<String> tags = new ArrayList<String>(data.size());
+    for (WordData entry : data) {
+      CharSequence tag = entry.getTag();
+      // skip entries without a tag instead of failing on them
+      if (tag != null) {
+        tags.add(tag.toString());
+      }
+    }
+    return tags.isEmpty() ? null : tags.toArray(new String[tags.size()]);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
new file mode 100644
index 0000000..bd4d1a4
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.util;
+
+import java.io.File;
+
+import morfologik.stemming.DictionaryMetadata;
+
+/**
+ * Helpers for locating the files that accompany a Morfologik dictionary.
+ */
+public class MorfologikUtil {
+
+  /**
+   * Convenience overload taking the dictionary location as a path string.
+   *
+   * @param dictFile
+   *          path of the dictionary file
+   * @return the result of {@link #getExpectedPropertiesFile(File)} for that
+   *         path
+   */
+  public static File getExpectedPropertiesFile(String dictFile) {
+    return getExpectedPropertiesFile(new File(dictFile));
+  }
+
+  /**
+   * Returns the location at which Morfologik expects the metadata file of the
+   * given dictionary, as reported by
+   * <code>DictionaryMetadata.getExpectedMetadataLocation</code>.
+   *
+   * @param dictFile
+   *          the dictionary file
+   * @return the expected metadata (properties) file
+   */
+  public static File getExpectedPropertiesFile(File dictFile) {
+    return DictionaryMetadata
+        .getExpectedMetadataLocation(dictFile.toPath())
+        .toFile();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/LICENSE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/LICENSE b/opennlp-morfologik-addon/src/main/readme/LICENSE
new file mode 100644
index 0000000..576b4cf
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/readme/LICENSE
@@ -0,0 +1,230 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+The following license applies to the Snowball stemmers:
+        
+        Copyright (c) 2001, Dr Martin Porter
+        Copyright (c) 2002, Richard Boulton
+        All rights reserved.
+        
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions are met:
+        
+            * Redistributions of source code must retain the above copyright notice,
+            * this list of conditions and the following disclaimer.
+            * Redistributions in binary form must reproduce the above copyright
+            * notice, this list of conditions and the following disclaimer in the
+            * documentation and/or other materials provided with the distribution.
+            * Neither the name of the copyright holders nor the names of its contributors
+            * may be used to endorse or promote products derived from this software
+            * without specific prior written permission.
+        
+        THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+        AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+        IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+        DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+        FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+        DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+        SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+        CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+        OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+        OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE b/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
new file mode 100644
index 0000000..0554010
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/readme/MORFOLOGIK-LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2006 Dawid Weiss
+Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer.
+    
+    * Redistributions in binary form must reproduce the above copyright notice, 
+    this list of conditions and the following disclaimer in the documentation 
+    and/or other materials provided with the distribution.
+    
+    * Neither the name of Morfologik nor the names of its contributors 
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/main/readme/NOTICE
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/readme/NOTICE b/opennlp-morfologik-addon/src/main/readme/NOTICE
new file mode 100644
index 0000000..73fb1d7
--- /dev/null
+++ b/opennlp-morfologik-addon/src/main/readme/NOTICE
@@ -0,0 +1,11 @@
+Apache OpenNLP
+Copyright 2010, 2013 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+The snowball stemmers in
+opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
+were developed by Martin Porter and Richard Boulton.
+The full snowball package is available from
+http://snowball.tartarus.org/

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
new file mode 100644
index 0000000..0a7ba48
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+
+import junit.framework.TestCase;
+import morfologik.stemming.DictionaryMetadata;
+import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
+
+import org.junit.Test;
+
+/**
+ * Builds a Morfologik dictionary from the bundled test resources and checks
+ * that a lemmatizer can be created from it.
+ */
+public class POSDictionayBuilderTest extends TestCase {
+
+  @Test
+  public void testBuildDictionary() throws Exception {
+
+    Path output = createMorfologikDictionary();
+
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
+
+    assertNotNull(ml);
+  }
+
+  /**
+   * Copies the tab-separated test dictionary and its metadata file to
+   * temporary files and builds a Morfologik dictionary from them.
+   *
+   * @return the path returned by the dictionary builder
+   * @throws Exception
+   *           if a resource is missing or the dictionary cannot be built
+   */
+  public static Path createMorfologikDictionary() throws Exception {
+    Path tabFilePath = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".txt").toPath();
+    // The metadata file is placed where Morfologik expects it for the
+    // temporary dictionary file.
+    Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath);
+
+    copyResource("/dictionaryWithLemma.txt", tabFilePath);
+    copyResource("/dictionaryWithLemma.info", infoFilePath);
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+
+    return builder.build(tabFilePath);
+  }
+
+  /**
+   * Copies a classpath resource to the given file, replacing existing content
+   * and always closing the resource stream.
+   */
+  private static void copyResource(String resource, Path target)
+      throws IOException {
+    try (InputStream in =
+        POSDictionayBuilderTest.class.getResourceAsStream(resource)) {
+      if (in == null) {
+        // Fail with the resource name instead of an unexplained
+        // NullPointerException inside Files.copy.
+        throw new IOException("Test resource not found: " + resource);
+      }
+      Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
new file mode 100644
index 0000000..6b7525e
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -0,0 +1,35 @@
+package opennlp.morfologik.lemmatizer;
+
+import static org.junit.Assert.assertEquals;
+
+import java.nio.file.Path;
+
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+import org.junit.Test;
+
+/**
+ * Checks lemma lookup through a MorfologikLemmatizer built from the test
+ * dictionary.
+ */
+public class MorfologikLemmatizerTest {
+
+  @Test
+  public void testLemmatizeInsensitive() throws Exception {
+    DictionaryLemmatizer lemmatizer = createDictionary(false);
+
+    // The same surface form maps to different lemmas depending on the tag.
+    assertEquals("casar", lemmatizer.lemmatize("casa", "V"));
+    assertEquals("casa", lemmatizer.lemmatize("casa", "NOUN"));
+
+    assertEquals("casa", lemmatizer.lemmatize("Casa", "PROP"));
+  }
+
+  /**
+   * Builds the test dictionary and wraps it in a lemmatizer.
+   *
+   * NOTE(review): the caseSensitive argument is not used here; the lemmatizer
+   * is always created from the dictionary path alone.
+   */
+  private MorfologikLemmatizer createDictionary(boolean caseSensitive)
+      throws Exception {
+    return new MorfologikLemmatizer(
+        POSDictionayBuilderTest.createMorfologikDictionary());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
new file mode 100644
index 0000000..c6c9e04
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -0,0 +1,78 @@
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.postag.TagDictionary;
+
+import org.junit.Test;
+
+/**
+ * Checks tag lookup through MorfologikTagDictionary in case sensitive and
+ * case insensitive mode.
+ */
+public class MorfologikTagDictionaryTest {
+
+  @Test
+  public void testNoLemma() throws Exception {
+    MorfologikTagDictionary tagDict = createDictionary(false);
+
+    List<String> carroTags = Arrays.asList(tagDict.getTags("carro"));
+    assertEquals(1, carroTags.size());
+    assertTrue(carroTags.contains("NOUN"));
+  }
+
+  @Test
+  public void testPOSDictionaryInsensitive() throws Exception {
+    TagDictionary tagDict = createDictionary(false);
+
+    List<String> lowerTags = Arrays.asList(tagDict.getTags("casa"));
+    assertEquals(2, lowerTags.size());
+    assertTrue(lowerTags.contains("NOUN"));
+    assertTrue(lowerTags.contains("V"));
+
+    // Case insensitive lookup: "Casa" is expected to yield the same tags as
+    // "casa", so its proper-noun reading is not returned.
+    List<String> upperTags = Arrays.asList(tagDict.getTags("Casa"));
+    assertEquals(2, upperTags.size());
+    assertTrue(upperTags.contains("NOUN"));
+    assertTrue(upperTags.contains("V"));
+  }
+
+  @Test
+  public void testPOSDictionarySensitive() throws Exception {
+    TagDictionary tagDict = createDictionary(true);
+
+    List<String> lowerTags = Arrays.asList(tagDict.getTags("casa"));
+    assertEquals(2, lowerTags.size());
+    assertTrue(lowerTags.contains("NOUN"));
+    assertTrue(lowerTags.contains("V"));
+
+    // Case sensitive lookup: the capitalized "Casa" is expected to yield only
+    // the proper-noun tag.
+    List<String> upperTags = Arrays.asList(tagDict.getTags("Casa"));
+    assertEquals(1, upperTags.size());
+    assertTrue(upperTags.contains("PROP"));
+  }
+
+  /** Creates a tag dictionary over the test dictionary. */
+  private MorfologikTagDictionary createDictionary(boolean caseSensitive)
+      throws Exception {
+    return this.createDictionary(caseSensitive, null);
+  }
+
+  /**
+   * Builds the test dictionary, reads it and wraps it in a tag dictionary.
+   * The constant argument is not used.
+   */
+  private MorfologikTagDictionary createDictionary(boolean caseSensitive,
+      List<String> constant) throws Exception {
+    Dictionary morfologikDict =
+        Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary());
+    return new MorfologikTagDictionary(morfologikDict, caseSensitive);
+  }
+
+}


[13/16] opennlp git commit: OPENNLP-622 Preparing to migrate morfologik-addon to main repository

Posted by co...@apache.org.
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
new file mode 100644
index 0000000..7341a02
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.*;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.postag.WordTagSampleStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelType;
+
+import org.junit.Test;
+
+/**
+ * Tests for the {@link POSTaggerFactory} class.
+ */
+public class POSTaggerFactoryTest {
+
+  /**
+   * Opens the AnnotatedSentences.txt test resource as a stream of POS samples.
+   */
+  private static ObjectStream<POSSample> createSampleStream()
+      throws IOException {
+    ClassLoader loader = POSTaggerFactoryTest.class.getClassLoader();
+    InputStream sentences = loader.getResourceAsStream("AnnotatedSentences.txt");
+
+    // NOTE(review): no charset is given, so the platform default is used.
+    return new WordTagSampleStream(new InputStreamReader(sentences));
+  }
+
+  /**
+   * Trains a POS model on the sample sentences with the given factory.
+   * The type argument is not used.
+   */
+  static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
+      throws IOException {
+    ObjectStream<POSSample> samples = createSampleStream();
+    return POSTaggerME.train("en", samples,
+        TrainingParameters.defaultParams(), factory);
+  }
+
+  /**
+   * Trains a model with a Morfologik-backed tag dictionary and checks that the
+   * dictionary type is kept both directly after training and after a
+   * serialize/deserialize round trip.
+   */
+  @Test
+  public void testPOSTaggerWithCustomFactory() throws Exception {
+
+    Path dictionary = POSDictionayBuilderTest.createMorfologikDictionary();
+    POSTaggerFactory trainingFactory = new MorfologikPOSTaggerFactory();
+    TagDictionary trainingDict =
+        trainingFactory.createTagDictionary(dictionary.toFile());
+    trainingFactory.setTagDictionary(trainingDict);
+
+    POSModel trained = trainPOSModel(ModelType.MAXENT, trainingFactory);
+
+    POSTaggerFactory trainedFactory = trained.getFactory();
+    assertTrue(trainedFactory.getTagDictionary() instanceof MorfologikTagDictionary);
+
+    // Round trip through an in-memory buffer.
+    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    trained.serialize(buffer);
+    ByteArrayInputStream serialized = new ByteArrayInputStream(buffer.toByteArray());
+
+    POSModel reloaded = new POSModel(serialized);
+
+    POSTaggerFactory reloadedFactory = reloaded.getFactory();
+    assertTrue(reloadedFactory.getTagDictionary() instanceof MorfologikTagDictionary);
+
+    assertEquals(2, reloadedFactory.getTagDictionary().getTags("casa").length);
+  }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt b/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt
new file mode 100644
index 0000000..b40be87
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/resources/AnnotatedSentences.txt
@@ -0,0 +1,136 @@
+Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._.
+I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._.
+So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._.
+She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._.
+I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._.
+
+Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._.
+Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._.
+I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._.
+As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._.
+The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._.
+But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._.
+It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._.
+She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstra\ufffde_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._.
+
+Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_.
+About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
+That_DT sounds_VBZ good_JJ ._.
+So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstra\ufffde_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._.
+I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstra\ufffde_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._.
+Thank_VB you_PRP very_RB much_RB !_.
+
+Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._.
+I_PRP 'm_VBP in_IN N\ufffdrnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._.
+Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstra\ufffde_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Caf\ufffd_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._.
+Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_.
+
+My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._.
+We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._.
+Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._.
+As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._.
+But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._.
+Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstra\ufffde_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._.
+
+I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._.
+I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._.
+I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._.
+I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._.
+
+Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._.
+As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._.
+The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._.
+Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._.
+
+An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._.
+Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._.
+She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._.
+But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._.
+So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT caf\ufffd_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._.
+It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._.
+She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._.
+
+Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_.
+They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._.
+Are_VBP they_PRP made_VBN of_IN leather_NN ?_.
+No,_NNP that_DT 's_VBZ faked_VBN ._.
+But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._.
+I_PRP got_VBD them_PRP from_IN Hamburg._NNP
+Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstra\ufffde_NNP ?_.
+It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._.
+I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._.
+Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_.
+Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstra\ufffde_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._.
+I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._.
+His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._.
+
+Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_.
+My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_.
+How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_.
+His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._.
+I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN
+I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP
+Is_VBZ that_DT right_NN ?_.
+Yes_UH ,_, it_PRP definitely_RB is_VBZ ._.
+So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Stra\ufffde_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._.
+Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._.
+Bye_NNP !_.
+
+On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._.
+The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._.
+Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._.
+But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._.
+Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstra\ufffde_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._.
+
+Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._.
+I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._.
+I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._.
+An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN G\ufffdrtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._.
+Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN G\ufffdrtnerweg_NNP 25_CD ._.
+The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstra\ufffde_NNP 15_CD ._.
+He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._.
+
+Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._.
+He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._.
+Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._.
+Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._.
+It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._.
+
+When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._.
+He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._.
+One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._.
+So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._.
+She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstra\ufffde_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._.
+Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._.
+
+On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._.
+The_DT driver_NN got_VBD badly_RB injured_VBN ._.
+Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._.
+A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._.
+He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._.
+He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-L\ufffdw-Stra\ufffde_NNP 13_CD in_IN 84630_CD Gauting_NNP ._.
+The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._.
+Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._.
+
+Hi_NNP ,_, how_WRB are_VBP you_PRP ?_.
+Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_.
+Yeah_UH for_IN sure_JJ ._.
+How_WRB did_VBD you_PRP know_VB that_DT ?_.
+I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._.
+Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._.
+Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_.
+Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._.
+But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._.
+I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstra\ufffde_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._.
+The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._.
+I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._.
+Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_.
+
+My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._.
+When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._.
+My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstra\ufffde_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._.
+But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._.
+So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._.
+Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._.
+We_PRP live_VBP in_IN B\ufffdrgerstra\ufffde_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._.
+I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info
new file mode 100644
index 0000000..ad5fe8d
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.info
@@ -0,0 +1,15 @@
+#
+# REQUIRED PROPERTIES
+#
+
+# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding.
+fsa.dict.separator=,
+
+# The charset in which the input is encoded. UTF-8 is strongly recommended.
+fsa.dict.encoding=UTF-8
+
+# The type of lemma-inflected form encoding compression that precedes automaton
+# construction. Allowed values: [suffix, infix, prefix, none].
+# Details are in Daciuk's paper and in the code. 
+# Leave at 'prefix' if not sure.
+fsa.dict.encoder=prefix
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
new file mode 100644
index 0000000..09d39e3
--- /dev/null
+++ b/opennlp-morfologik-addon/src/test/resources/dictionaryWithLemma.txt
@@ -0,0 +1,11 @@
+casa,casa,NOUN
+casar,casa,V
+casar,casar,V-INF
+Casa,Casa,PROP
+casa,casinha,NOUN
+casa,casona,NOUN
+menino,menina,NOUN
+menino,menino,NOUN
+menino,meninão,NOUN
+menino,menininho,NOUN
+carro,carro,NOUN
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
deleted file mode 100644
index 56d0e47..0000000
--- a/pom.xml
+++ /dev/null
@@ -1,109 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-	<modelVersion>4.0.0</modelVersion>
-
-	<groupId>org.apache.opennlp</groupId>
-	<artifactId>morfologik-addon</artifactId>
-	<version>1.0-SNAPSHOT</version>
-	<packaging>jar</packaging>
-	<name>Morfologik Addon</name>
-
-	<url>http://maven.apache.org</url>
-	<build>
-		<plugins>
-			<plugin>
-				<groupId>org.apache.maven.plugins</groupId>
-				<artifactId>maven-compiler-plugin</artifactId>
-				<version>2.3.2</version>
-				<configuration>
-					<source>1.7</source>
-					<target>1.7</target>
-				</configuration>
-			</plugin>
-			<plugin>
-				<artifactId>maven-assembly-plugin</artifactId>
-				<executions>
-					<execution>
-						<id>bundle-project-sources</id>
-						<phase>package</phase>
-						<goals>
-							<goal>single</goal>
-						</goals>
-						<configuration>
-							<descriptors>
-								<descriptor>src/main/assembly/bin.xml</descriptor>
-								<descriptor>src/main/assembly/src.xml</descriptor>
-							</descriptors>
-							<!-- Tar package is only compatible with gnu tar,
-							     many file have more than 100 chars.
-							     Right now only javadoc files are too long.
-							 -->
-							 <tarLongFileMode>gnu</tarLongFileMode>
-							 
-							 <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName>
-						</configuration>
-					</execution>
-				</executions>
-			</plugin>
-			<plugin> 
-	        <artifactId>maven-antrun-plugin</artifactId> 
-	        <version>1.6</version> 
-	        <executions> 
-	          <execution> 
-	            <id>generate checksums for binary artifacts</id> 
-	            <goals><goal>run</goal></goals> 
-	            <phase>verify</phase> 
-	            <configuration> 
-	              <target> 
-	                <checksum algorithm="sha1" format="MD5SUM"> 
-	                  <fileset dir="${project.build.directory}"> 
-	                    <include name="*.zip" /> 
-	                    <include name="*.gz" /> 
-	                  </fileset> 
-	                </checksum> 
-	                <checksum algorithm="md5" format="MD5SUM"> 
-	                  <fileset dir="${project.build.directory}"> 
-	                    <include name="*.zip" /> 
-	                    <include name="*.gz" /> 
-	                  </fileset> 
-	                </checksum> 
-	              </target> 
-	            </configuration> 
-	          </execution> 
-	        </executions> 
-	      </plugin>
-		</plugins>
-	</build>
-	<properties>
-		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-	</properties>
-
-	<dependencies>
-		<dependency>
-			<groupId>org.carrot2</groupId>
-			<artifactId>morfologik-stemming</artifactId>
-			<version>2.1.0</version>
-			<scope>compile</scope>
-		</dependency>
-		<dependency>
-			<groupId>org.carrot2</groupId>
-			<artifactId>morfologik-tools</artifactId>
-			<version>2.1.0</version>
-			<scope>compile</scope>
-		</dependency>
-
-		<dependency>
-			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-tools</artifactId>
-			<version>1.6.0</version>
-		</dependency>
-
-		<dependency>
-			<groupId>junit</groupId>
-			<artifactId>junit</artifactId>
-			<version>4.8.1</version>
-			<scope>test</scope>
-		</dependency>
-
-	</dependencies>
-</project>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/bin.xml b/src/main/assembly/bin.xml
deleted file mode 100644
index ab4f6da..0000000
--- a/src/main/assembly/bin.xml
+++ /dev/null
@@ -1,91 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements.  See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership.  The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied.  See the License for the
-   specific language governing permissions and limitations
-   under the License.    
--->
-
-<assembly>
-  <id>bin</id>
-  <formats>
-    <format>tar.gz</format>
-    <format>zip</format>
-    <format>dir</format>
-  </formats>
-  
-    <includeBaseDirectory>true</includeBaseDirectory>
-	<baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
-  
-	<dependencySets>
-		<dependencySet>
-			<scope>runtime</scope>
-			<unpack>false</unpack>
-			<useProjectArtifact>false</useProjectArtifact>
-			<fileMode>644</fileMode>
-			<directoryMode>755</directoryMode>
-			<outputDirectory>lib</outputDirectory>
-			<useTransitiveDependencies>true</useTransitiveDependencies>
-		</dependencySet>
-	</dependencySets>
-	
-	<fileSets>
-	    <fileSet>
-	    	<directory>src/main/readme</directory>
-	    	<outputDirectory></outputDirectory>
-	    	<fileMode>644</fileMode>
-	    	<directoryMode>755</directoryMode>      
-	    </fileSet>
-		
-	    <fileSet>
-	      <directory>.</directory>
-	      <outputDirectory></outputDirectory>
-	      <filtered>true</filtered>
-	      <fileMode>644</fileMode>
-	      <directoryMode>755</directoryMode> 
-	      <includes>
-	        <include>README</include>
-	        <include>RELEASE_NOTES.html</include>
-	      </includes>       
-	    </fileSet>
-	    
-	    <fileSet>
-	      <directory>target</directory>
-	      <outputDirectory></outputDirectory>
-	      <fileMode>644</fileMode>
-	      <directoryMode>755</directoryMode> 
-	      <includes>
-	        <include>issuesFixed/**</include>      
-	      </includes>       
-	    </fileSet>
-	    
-		<fileSet>
-			<directory>src/main/bin</directory>
-			<fileMode>755</fileMode>
-			<directoryMode>755</directoryMode>
-			<outputDirectory>bin</outputDirectory>
-		</fileSet>
-		
-		  <fileSet>
-		    <directory>target</directory>
-		    <outputDirectory>lib</outputDirectory>
-		    <includes>
-		      <include>morfologik-addon-*.jar</include>
-		    </includes>
-		  </fileSet>
-		
-	</fileSets>
-</assembly>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/src.xml b/src/main/assembly/src.xml
deleted file mode 100644
index cdcc9d3..0000000
--- a/src/main/assembly/src.xml
+++ /dev/null
@@ -1,39 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<assembly>
-  <id>src</id>
-  <formats>
-    <format>tar.gz</format>
-    <format>zip</format>
-  </formats>
-  
-  <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory>
-  
-  <fileSets>
-    <fileSet>
-      <directory>../</directory>
-      <outputDirectory></outputDirectory>
-      <excludes>
-        <exclude>**/target/**</exclude>
-        <exclude>**/.*/**</exclude>
-        <exclude>**/pom.xml.releaseBackup</exclude>
-        <exclude>**/release.properties</exclude>
-      </excludes>
-    </fileSet>
-  </fileSets>
-</assembly>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon b/src/main/bin/morfologik-addon
deleted file mode 100755
index 9b0faf9..0000000
--- a/src/main/bin/morfologik-addon
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh
-
-#   Licensed to the Apache Software Foundation (ASF) under one
-#   or more contributor license agreements.  See the NOTICE file
-#   distributed with this work for additional information
-#   regarding copyright ownership.  The ASF licenses this file
-#   to you under the Apache License, Version 2.0 (the
-#   "License"); you may not use this file except in compliance
-#   with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#   Unless required by applicable law or agreed to in writing,
-#   software distributed under the License is distributed on an
-#   #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#   KIND, either express or implied.  See the License for the
-#   specific language governing permissions and limitations
-#   under the License.
-
-# Note:  Do not output anything in this script file, any output
-#        may be inadvertantly placed in any output files if
-#        output redirection is used.
-
-if [ -z "$JAVACMD" ] ; then
-  if [ -n "$JAVA_HOME"  ] ; then
-    JAVACMD="$JAVA_HOME/bin/java"
-  else
-    JAVACMD="`which java`"
-  fi
-fi
-
-# Might fail if $0 is a link
-OPENNLP_HOME=`dirname "$0"`/..
-
-$JAVACMD -Xmx1024m -cp "lib/*" opennlp.morfologik.cmdline.CLI $@

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon.bat b/src/main/bin/morfologik-addon.bat
deleted file mode 100644
index aeec31f..0000000
--- a/src/main/bin/morfologik-addon.bat
+++ /dev/null
@@ -1,47 +0,0 @@
-@ECHO off
-
-REM #   Licensed to the Apache Software Foundation (ASF) under one
-REM #   or more contributor license agreements.  See the NOTICE file
-REM #   distributed with this work for additional information
-REM #   regarding copyright ownership.  The ASF licenses this file
-REM #   to you under the Apache License, Version 2.0 (the
-REM #   "License"); you may not use this file except in compliance
-REM #   with the License.  You may obtain a copy of the License at
-REM #
-REM #    http://www.apache.org/licenses/LICENSE-2.0
-REM #
-REM #   Unless required by applicable law or agreed to in writing,
-REM #   software distributed under the License is distributed on an
-REM #   #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-REM #   KIND, either express or implied.  See the License for the
-REM #   specific language governing permissions and limitations
-REM #   under the License.
-
-REM # Note:  Do not output anything in this script file, any output
-REM #        may be inadvertantly placed in any output files if
-REM #        output redirection is used.
-SETLOCAL
-
-IF "%JAVA_CMD%" == "" (
-	IF "%JAVA_HOME%" == "" (
-		SET JAVA_CMD=java 
-	) ELSE (
-		REM # Keep JAVA_HOME to short-name without spaces
-		FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java
-	)
-)
-
-REM #  Should work with Windows XP and greater.  If not, specify the path to where it is installed.
-IF "%OPENNLP_HOME%" == "" (
-	SET OPENNLP_HOME=%~sp0..
-) ELSE (
-	REM # Keep OPENNLP_HOME to short-name without spaces
-	FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA
-)
-
-REM #  Get the library JAR file name (JIRA OPENNLP-554)
-FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO SET JAR_FILE=%%A
-
-%JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %*
-
-ENDLOCAL
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/bin/opennlp-cp
----------------------------------------------------------------------
diff --git a/src/main/bin/opennlp-cp b/src/main/bin/opennlp-cp
deleted file mode 100755
index dff0d12..0000000
--- a/src/main/bin/opennlp-cp
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh
-
-#   Licensed to the Apache Software Foundation (ASF) under one
-#   or more contributor license agreements.  See the NOTICE file
-#   distributed with this work for additional information
-#   regarding copyright ownership.  The ASF licenses this file
-#   to you under the Apache License, Version 2.0 (the
-#   "License"); you may not use this file except in compliance
-#   with the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#   Unless required by applicable law or agreed to in writing,
-#   software distributed under the License is distributed on an
-#   #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#   KIND, either express or implied.  See the License for the
-#   specific language governing permissions and limitations
-#   under the License.
-
-# Note:  Do not output anything in this script file, any output
-#        may be inadvertantly placed in any output files if
-#        output redirection is used.
-
-if [ -z "$JAVACMD" ] ; then
-  if [ -n "$JAVA_HOME"  ] ; then
-    JAVACMD="$JAVA_HOME/bin/java"
-  else
-    JAVACMD="`which java`"
-  fi
-fi
-
-# Might fail if $0 is a link
-OPENNLP_HOME=`dirname "$0"`/..
-
-$JAVACMD -Xmx1024m -cp "lib/*" opennlp.tools.cmdline.CLI $@

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
deleted file mode 100644
index dbbca4d..0000000
--- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.builder;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.nio.file.Path;
-import java.util.Properties;
-
-import morfologik.stemming.DictionaryMetadata;
-import morfologik.stemming.EncoderType;
-import morfologik.tools.DictCompile;
-
-/**
- * Utility class to build Morfologik dictionaries from a tab separated values
- * file. The first column is the word, the second its lemma and the third a POS
- * tag. If there is no lemma information leave the second column empty.
- */
-public class MorfologikDictionayBuilder {
-
-  /**
-   * Helper to compile a morphological dictionary automaton.
-   * 
-   * @param input
-   *          The input file (base,inflected,tag). An associated metadata
-   *          (*.info) file must exist.
-   * @param overwrite
-   *          Overwrite the output file if it exists.
-   * @param validate
-   *          Validate input to make sure it makes sense.
-   * @param acceptBom
-   *          Accept leading BOM bytes (UTF-8).
-   * @param acceptCr
-   *          Accept CR bytes in input sequences (\r).
-   * @param ignoreEmpty
-   *          Ignore empty lines in the input.
-   * @return the dictionary path
-   * 
-   * @throws Exception
-   */
-  public Path build(Path input, boolean overwrite, boolean validate,
-      boolean acceptBom, boolean acceptCr, boolean ignoreEmpty)
-      throws Exception {
-
-    DictCompile compiler = new DictCompile(input, overwrite, validate,
-        acceptBom, acceptCr, ignoreEmpty);
-    compiler.call();
-
-    
-    Path metadataPath = DictionaryMetadata
-        .getExpectedMetadataLocation(input);
-    
-    return metadataPath.resolveSibling(
-        metadataPath.getFileName().toString().replaceAll(
-            "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict"));
-  }
-
-  /**
-   * Helper to compile a morphological dictionary automaton using default
-   * parameters.
-   * 
-   * @param input
-   *          The input file (base,inflected,tag). An associated metadata
-   *          (*.info) file must exist.
-   *          
-   *  @return the dictionary path
-   * 
-   * @throws Exception
-   */
-  public Path build(Path input) throws Exception {
-
-    return build(input, true, true, false, false, false);
-
-  }
-
-  Properties createProperties(Charset encoding, String separator,
-      EncoderType encoderType) throws FileNotFoundException, IOException {
-
-    Properties properties = new Properties();
-    properties.setProperty("fsa.dict.separator", separator);
-    properties.setProperty("fsa.dict.encoding", encoding.name());
-    properties.setProperty("fsa.dict.encoder", encoderType.name());
-
-    return properties;
-
-  }
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/CLI.java b/src/main/java/opennlp/morfologik/cmdline/CLI.java
deleted file mode 100644
index f92d178..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/CLI.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline;
-
-import java.util.Collections;
-import java.util.LinkedHashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool;
-import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool;
-import opennlp.tools.cmdline.BasicCmdLineTool;
-import opennlp.tools.cmdline.CmdLineTool;
-import opennlp.tools.cmdline.StreamFactoryRegistry;
-import opennlp.tools.cmdline.TerminateToolException;
-import opennlp.tools.cmdline.TypedCmdLineTool;
-import opennlp.tools.util.Version;
-
-public final class CLI {
-
-	public static final String CMD = "opennlp-morfologik-addon";
-
-	private static Map<String, CmdLineTool> toolLookupMap;
-
-	static {
-		toolLookupMap = new LinkedHashMap<String, CmdLineTool>();
-
-		List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
-
-		tools.add(new MorfologikDictionaryBuilderTool());
-		tools.add(new XMLDictionaryToTableTool());
-
-		for (CmdLineTool tool : tools) {
-			toolLookupMap.put(tool.getName(), tool);
-		}
-
-		toolLookupMap = Collections.unmodifiableMap(toolLookupMap);
-	}
-
-	/**
-	 * @return a set which contains all tool names
-	 */
-	public static Set<String> getToolNames() {
-		return toolLookupMap.keySet();
-	}
-
-	private static void usage() {
-		System.out.print("OpenNLP Morfologik Addon "
-				+ Version.currentVersion().toString() + ". ");
-		System.out.println("Usage: " + CMD + " TOOL");
-		System.out.println("where TOOL is one of:");
-
-		// distance of tool name from line start
-		int numberOfSpaces = -1;
-		for (String toolName : toolLookupMap.keySet()) {
-			if (toolName.length() > numberOfSpaces) {
-				numberOfSpaces = toolName.length();
-			}
-		}
-		numberOfSpaces = numberOfSpaces + 4;
-
-		for (CmdLineTool tool : toolLookupMap.values()) {
-
-			System.out.print("  " + tool.getName());
-
-			for (int i = 0; i < Math.abs(tool.getName().length()
-					- numberOfSpaces); i++) {
-				System.out.print(" ");
-			}
-
-			System.out.println(tool.getShortDescription());
-		}
-
-		System.out
-				.println("All tools print help when invoked with help parameter");
-		System.out
-				.println("Example: opennlp-morfologik-addon POSDictionaryBuilder help");
-	}
-
-
-	  @SuppressWarnings("rawtypes")
-    public static void main(String[] args) {
-
-	    if (args.length == 0) {
-	      usage();
-	      System.exit(0);
-	    }
-
-	    String toolArguments[] = new String[args.length -1];
-	    System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
-
-	    String toolName = args[0];
-
-	    //check for format
-	    String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
-	    int idx = toolName.indexOf(".");
-	    if (-1 < idx) {
-	      formatName = toolName.substring(idx + 1);
-	      toolName = toolName.substring(0, idx);
-	    }
-	    CmdLineTool tool = toolLookupMap.get(toolName);
-
-	    try {
-	      if (null == tool) {
-	        throw new TerminateToolException(1, "Tool " + toolName + " is not found.");
-	      }
-
-	      if ((0 == toolArguments.length && tool.hasParams()) ||
-	          0 < toolArguments.length && "help".equals(toolArguments[0])) {
-	          if (tool instanceof TypedCmdLineTool) {
-	            System.out.println(((TypedCmdLineTool) tool).getHelp(formatName));
-	          } else if (tool instanceof BasicCmdLineTool) {
-	            System.out.println(tool.getHelp());
-	          }
-
-	          System.exit(0);
-	      }
-
-	      if (tool instanceof TypedCmdLineTool) {
-	        ((TypedCmdLineTool) tool).run(formatName, toolArguments);
-	      } else if (tool instanceof BasicCmdLineTool) {
-	        if (-1 == idx) {
-	          ((BasicCmdLineTool) tool).run(toolArguments);
-	        } else {
-	          throw new TerminateToolException(1, "Tool " + toolName + " does not support formats.");
-	        }
-	      } else {
-	        throw new TerminateToolException(1, "Tool " + toolName + " is not supported.");
-	      }
-	    }
-	    catch (TerminateToolException e) {
-
-	      if (e.getMessage() != null) {
-	        System.err.println(e.getMessage());
-	      }
-
-	      if (e.getCause() != null) {
-	        System.err.println(e.getCause().getMessage());
-	        e.getCause().printStackTrace(System.err);
-	      }
-
-	      System.exit(e.getCode());
-	    }
-	  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
deleted file mode 100644
index 5ea2e4f..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline.builder;
-
-import java.io.File;
-
-import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
-import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.params.EncodingParameter;
-
-/**
- * Params for Dictionary tools.
- */
-interface MorfologikDictionaryBuilderParams extends EncodingParameter {
-
-  @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.")
-  File getInputFile();
-  
-  @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).")
-  @OptionalParameter(defaultValue="false")
-  Boolean getAcceptBOM();
-  
-  @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\r).")
-  @OptionalParameter(defaultValue="false")
-  Boolean getAcceptCR();
-  
-  @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.")
-  @OptionalParameter(defaultValue="FSA5")
-  String getFormat();
-  
-  @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.")
-  @OptionalParameter(defaultValue="false")
-  Boolean getIgnoreEmpty();
-  
-  @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.")
-  @OptionalParameter(defaultValue="false")
-  Boolean getOverwrite();
-  
-  @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.")
-  @OptionalParameter(defaultValue="false")
-  Boolean getValidate();
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
deleted file mode 100644
index eb9b51c..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline.builder;
-
-import java.io.File;
-import java.nio.file.Path;
-
-import morfologik.stemming.DictionaryMetadata;
-import opennlp.morfologik.builder.MorfologikDictionayBuilder;
-import opennlp.tools.cmdline.BasicCmdLineTool;
-import opennlp.tools.cmdline.CmdLineUtil;
-import opennlp.tools.cmdline.TerminateToolException;
-
-public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
-
-  interface Params extends MorfologikDictionaryBuilderParams {
-  }
-
-  public String getShortDescription() {
-    return "builds a binary POS Dictionary using Morfologik";
-  }
-
-  public String getHelp() {
-    return getBasicHelp(Params.class);
-  }
-
-  public void run(String[] args) {
-    Params params = validateAndParseParams(args, Params.class);
-
-    File dictInFile = params.getInputFile();
-
-    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
-    Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath());
-    CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile());
-
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    try {
-      builder.build(dictInFile.toPath(), params.getOverwrite(),
-          params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(),
-          params.getIgnoreEmpty());
-    } catch (Exception e) {
-      throw new TerminateToolException(-1,
-          "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e);
-    }
-
-  }
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
deleted file mode 100644
index 4ee8cd4..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline.builder;
-
-import java.io.File;
-
-import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
-import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
-import opennlp.tools.cmdline.params.EncodingParameter;
-
-/**
- * Params for Dictionary tools.
- */
-interface XMLDictionaryToTableParams extends EncodingParameter {
-
-  @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
-  File getInputFile();
-
-  @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).")
-  File getOutputFile();
-
-  @ParameterDescription(valueName = "char", description = "Columm separator (must be a single character)")
-  @OptionalParameter(defaultValue=",")
-  String getSeparator();
-  
-  @ParameterDescription(valueName = "value", description = " Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].")
-  @OptionalParameter(defaultValue="prefix")
-  String getEncoder();
-  
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
deleted file mode 100644
index 0e7f2d5..0000000
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.cmdline.builder;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Iterator;
-import java.util.Properties;
-
-import morfologik.stemming.DictionaryMetadata;
-import opennlp.tools.cmdline.BasicCmdLineTool;
-import opennlp.tools.cmdline.CmdLineUtil;
-import opennlp.tools.cmdline.TerminateToolException;
-import opennlp.tools.postag.POSDictionary;
-
-public class XMLDictionaryToTableTool extends BasicCmdLineTool {
-
-  interface Params extends XMLDictionaryToTableParams {
-  }
-
-  private String SEPARATOR;
-
-  public String getShortDescription() {
-    return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
-  }
-
-  public String getHelp() {
-    return getBasicHelp(Params.class);
-  }
-
-  public void run(String[] args) {
-    Params params = validateAndParseParams(args, Params.class);
-
-    File dictInFile = params.getInputFile();
-    File dictOutFile = params.getOutputFile();
-    Charset encoding = params.getEncoding();
-    SEPARATOR = params.getSeparator();
-
-    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
-    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
-
-    POSDictionary tagDictionary = null;
-    try {
-      tagDictionary = POSDictionary.create(new FileInputStream(dictInFile));
-    } catch (IOException e) {
-      throw new TerminateToolException(-1,
-          "Error while loading XML POS Dictionay: " + e.getMessage(), e);
-    }
-    Iterator<String> iterator = tagDictionary.iterator();
-
-    try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(),
-        encoding)) {
-      while (iterator.hasNext()) {
-        String word = iterator.next();
-        for (String tag : tagDictionary.getTags(word)) {
-          if(valid(word,tag)) {
-            String entry = createEntry(word, tag);
-            writer.write(entry);
-            writer.newLine();
-          }
-        }
-      }
-      writer.close();
-      System.out.println("Created dictionary: " + dictOutFile.toPath());
-    } catch (IOException e) {
-      throw new TerminateToolException(-1, "Error while writing output: "
-          + e.getMessage(), e);
-    }
-    
-    Properties info = new Properties();
-    info.setProperty("fsa.dict.separator", SEPARATOR);
-    info.setProperty("fsa.dict.encoding", params.getEncoding().name());
-    info.setProperty("fsa.dict.encoder", params.getEncoder());
-    
-    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath());
-    
-    try {
-      info.store(Files.newOutputStream(metaPath), "Info file for FSA Morfologik dictionary.");
-    } catch (IOException e) {
-      throw new TerminateToolException(-1, "Error while writing metadata output: "
-          + e.getMessage(), e);
-    }
-    System.out.println("Created metadata: " + dictOutFile.toPath());
-    
-  }
-
-  private boolean valid(String word, String tag) {
-    if(word.contains(SEPARATOR) || tag.contains(SEPARATOR)) {
-      System.out
-          .println("Warn: invalid entry because contains separator - word: "
-              + word + " tag: " + tag);
-      return false;
-    }
-    
-    return true;
-  }
-
-  private String createEntry(String word, String tag) {
-    
-    String entry = "" + SEPARATOR +// base
-        word + SEPARATOR +
-        tag;
-        
-    return entry;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
deleted file mode 100644
index 2090ce5..0000000
--- a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.lemmatizer;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import morfologik.stemming.Dictionary;
-import morfologik.stemming.DictionaryLookup;
-import morfologik.stemming.IStemmer;
-import morfologik.stemming.WordData;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
-
-public class MorfologikLemmatizer implements DictionaryLemmatizer {
-
-  private IStemmer dictLookup;
-  public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
-      "NNP", "NP00000"));
-
-  public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
-      IOException {
-    dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
-  }
-
-  private HashMap<List<String>, String> getLemmaTagsDict(String word) {
-    List<WordData> wdList = dictLookup.lookup(word);
-    HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
-    for (WordData wd : wdList) {
-      List<String> wordLemmaTags = new ArrayList<String>();
-      wordLemmaTags.add(word);
-      wordLemmaTags.add(wd.getTag().toString());
-      dictMap.put(wordLemmaTags, wd.getStem().toString());
-    }
-    return dictMap;
-  }
-
-  private List<String> getDictKeys(String word, String postag) {
-    List<String> keys = new ArrayList<String>();
-    if (constantTags.contains(postag)) {
-      keys.addAll(Arrays.asList(word, postag));
-    } else {
-      keys.addAll(Arrays.asList(word.toLowerCase(), postag));
-    }
-    return keys;
-  }
-
-  private HashMap<List<String>, String> getDictMap(String word, String postag) {
-    HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>();
-
-    if (constantTags.contains(postag)) {
-      dictMap = this.getLemmaTagsDict(word);
-    } else {
-      dictMap = this.getLemmaTagsDict(word.toLowerCase());
-    }
-    return dictMap;
-  }
-
-  public String lemmatize(String word, String postag) {
-    String lemma = null;
-    List<String> keys = this.getDictKeys(word, postag);
-    HashMap<List<String>, String> dictMap = this.getDictMap(word, postag);
-    // lookup lemma as value of the map
-    String keyValue = dictMap.get(keys);
-    if (keyValue != null) {
-      lemma = keyValue;
-    } else if (keyValue == null && constantTags.contains(postag)) {
-      lemma = word;
-    } else if (keyValue == null && word.toUpperCase() == word) {
-      lemma = word;
-    } else {
-      lemma = word.toLowerCase();
-    }
-    return lemma;
-  }
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
deleted file mode 100644
index 93d6c61..0000000
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.tagdict;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Map;
-
-import morfologik.stemming.DictionaryMetadata;
-import opennlp.tools.dictionary.Dictionary;
-import opennlp.tools.postag.POSTaggerFactory;
-import opennlp.tools.postag.TagDictionary;
-import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.model.ArtifactSerializer;
-import opennlp.tools.util.model.ModelUtil;
-
-public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
-
-  private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
-  private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";
-
-  private static final String MORFOLOGIK_POSDICT = "tagdict."
-      + MORFOLOGIK_POSDICT_SUF;
-  private static final String MORFOLOGIK_DICT_INFO = "tagdict."
-      + MORFOLOGIK_DICT_INFO_SUF;
-
-  private TagDictionary dict;
-
-  private byte[] dictInfo;
-  private byte[] dictData;
-
-  public MorfologikPOSTaggerFactory() {
-  }
-  
-  public TagDictionary createTagDictionary(File dictionary)
-      throws InvalidFormatException, FileNotFoundException, IOException {
-    
-    if(!dictionary.canRead()) {
-      throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath());
-    }
-    
-    Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath());
-    
-    if(dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) {
-      throw new FileNotFoundException("Could not read dictionary metadata: " + dictionaryMeta.getFileName());
-    }
-    
-    this.dictData = Files.readAllBytes(dictionary.toPath());
-    this.dictInfo = Files.readAllBytes(dictionaryMeta);
-    
-    return createMorfologikDictionary(dictData, dictInfo);
-    
-  }
-  
-
-  @Override
-  protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
-    super.init(ngramDictionary, null);
-    this.dict = posDictionary;
-  }
-
-  @Override
-  public TagDictionary getTagDictionary() {
-    if (this.dict == null) {
-
-      if (artifactProvider != null) {
-        Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
-        if (obj != null) {
-          byte[] data = (byte[]) artifactProvider
-              .getArtifact(MORFOLOGIK_POSDICT);
-          byte[] info = (byte[]) artifactProvider
-              .getArtifact(MORFOLOGIK_DICT_INFO);
-
-          try {
-            this.dict = createMorfologikDictionary(data, info);
-          } catch (IllegalArgumentException e) {
-            throw new RuntimeException(
-                "Could not load the dictionary files to Morfologik.", e);
-          } catch (IOException e) {
-            throw new RuntimeException(
-                "IO error while reading the Morfologik dictionary files.", e);
-          }
-        }
-      }
-    }
-
-    return this.dict;
-  }
-
-  @Override
-  public void setTagDictionary(TagDictionary dictionary) {
-    this.dict = dictionary;
-  }
-
-  @Override
-  public TagDictionary createEmptyTagDictionary() {
-    throw new UnsupportedOperationException(
-        "Morfologik POS Tagger factory does not support this operation");
-  }
-
-  @Override
-  public TagDictionary createTagDictionary(InputStream in)
-      throws InvalidFormatException, IOException {
-    throw new UnsupportedOperationException(
-        "Morfologik POS Tagger factory does not support this operation");
-  }
-
-  @Override
-  @SuppressWarnings("rawtypes")
-  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
-    Map<String, ArtifactSerializer> serializers = super
-        .createArtifactSerializersMap();
-
-    serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
-    serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());
-
-    return serializers;
-  }
-
-  @Override
-  public Map<String, Object> createArtifactMap() {
-    Map<String, Object> artifactMap = super.createArtifactMap();
-    artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
-    artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
-    return artifactMap;
-  }
-
-  private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
-      throws IOException {
-    morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary
-        .read(new ByteArrayInputStream(data), new ByteArrayInputStream(
-            info));
-    return new MorfologikTagDictionary(dict);
-  }
-
-  static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
-
-    public byte[] create(InputStream in) throws IOException,
-        InvalidFormatException {
-
-      return ModelUtil.read(in);
-    }
-
-    public void serialize(byte[] artifact, OutputStream out) throws IOException {
-      out.write(artifact);
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
deleted file mode 100644
index b34ca2b..0000000
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.tagdict;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import morfologik.stemming.Dictionary;
-import morfologik.stemming.DictionaryLookup;
-import morfologik.stemming.IStemmer;
-import morfologik.stemming.WordData;
-import opennlp.tools.postag.TagDictionary;
-
-/**
- * A POS Tagger dictionary implementation based on Morfologik binary
- * dictionaries
- */
-public class MorfologikTagDictionary implements TagDictionary {
-
-  private IStemmer dictLookup;
-  private boolean isCaseSensitive;
-
-  /**
-   * Creates a case sensitive {@link MorfologikTagDictionary}
-   *
-   * @param dict
-   *          a Morfologik FSA dictionary
-   * @throws IllegalArgumentException
-   *           if FSA's root node cannot be acquired (dictionary is empty).
-   * @throws IOException
-   *           could not read dictionary from dictURL
-   */
-  public MorfologikTagDictionary(Dictionary dict)
-      throws IllegalArgumentException, IOException {
-    this(dict, true);
-  }
-
-  /**
-   * Creates MorfologikLemmatizer
-   *
-   * @param dict
-   *          a Morfologik FSA dictionary
-   * @param caseSensitive
-   *          if true it performs case sensitive lookup
-   * @throws IllegalArgumentException
-   *           if FSA's root node cannot be acquired (dictionary is empty).
-   * @throws IOException
-   *           could not read dictionary from dictURL
-   */
-  public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
-      throws IllegalArgumentException, IOException {
-    this.dictLookup = new DictionaryLookup(dict);
-    this.isCaseSensitive = caseSensitive;
-  }
-
-  @Override
-  public String[] getTags(String word) {
-    if (!isCaseSensitive) {
-      word = word.toLowerCase();
-    }
-
-    List<WordData> data = dictLookup.lookup(word);
-    if (data != null && data.size() > 0) {
-      List<String> tags = new ArrayList<String>(data.size());
-      for (int i = 0; i < data.size(); i++) {
-        tags.add(data.get(i).getTag().toString());
-      }
-      if (tags.size() > 0)
-        return tags.toArray(new String[tags.size()]);
-      return null;
-    }
-    return null;
-  }
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/util/MorfologikUtil.java b/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
deleted file mode 100644
index bd4d1a4..0000000
--- a/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.util;
-
-import java.io.File;
-
-import morfologik.stemming.DictionaryMetadata;
-
-public class MorfologikUtil {
-  
-  public static File getExpectedPropertiesFile(File dictFile) {
-    return DictionaryMetadata.getExpectedMetadataLocation(dictFile.toPath())
-        .toFile();
-  }
-  
-  public static File getExpectedPropertiesFile(String dictFile) {
-    File f = new File(dictFile);
-    return getExpectedPropertiesFile(f);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/LICENSE
----------------------------------------------------------------------
diff --git a/src/main/readme/LICENSE b/src/main/readme/LICENSE
deleted file mode 100644
index 576b4cf..0000000
--- a/src/main/readme/LICENSE
+++ /dev/null
@@ -1,230 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
-The following license applies to the Snowball stemmers:
-        
-        Copyright (c) 2001, Dr Martin Porter
-        Copyright (c) 2002, Richard Boulton
-        All rights reserved.
-        
-        Redistribution and use in source and binary forms, with or without
-        modification, are permitted provided that the following conditions are met:
-        
-            * Redistributions of source code must retain the above copyright notice,
-            * this list of conditions and the following disclaimer.
-            * Redistributions in binary form must reproduce the above copyright
-            * notice, this list of conditions and the following disclaimer in the
-            * documentation and/or other materials provided with the distribution.
-            * Neither the name of the copyright holders nor the names of its contributors
-            * may be used to endorse or promote products derived from this software
-            * without specific prior written permission.
-        
-        THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-        AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-        IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-        DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-        FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-        DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-        SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-        CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-        OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/MORFOLOGIK-LICENSE
----------------------------------------------------------------------
diff --git a/src/main/readme/MORFOLOGIK-LICENSE b/src/main/readme/MORFOLOGIK-LICENSE
deleted file mode 100644
index 0554010..0000000
--- a/src/main/readme/MORFOLOGIK-LICENSE
+++ /dev/null
@@ -1,28 +0,0 @@
-Copyright (c) 2006 Dawid Weiss
-Copyright (c) 2007-2015 Dawid Weiss, Marcin Mi\u0142kowski
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification, 
-are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice, 
-    this list of conditions and the following disclaimer.
-    
-    * Redistributions in binary form must reproduce the above copyright notice, 
-    this list of conditions and the following disclaimer in the documentation 
-    and/or other materials provided with the distribution.
-    
-    * Neither the name of Morfologik nor the names of its contributors 
-    may be used to endorse or promote products derived from this software 
-    without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/main/readme/NOTICE
----------------------------------------------------------------------
diff --git a/src/main/readme/NOTICE b/src/main/readme/NOTICE
deleted file mode 100644
index 73fb1d7..0000000
--- a/src/main/readme/NOTICE
+++ /dev/null
@@ -1,11 +0,0 @@
-Apache OpenNLP
-Copyright 2010, 2013 The Apache Software Foundation
-
-This product includes software developed at
-The Apache Software Foundation (http://www.apache.org/).
-
-The snowball stemmers in
-opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
-were developed by Martin Porter and Richard Boulton.
-The full snowball package is available from
-http://snowball.tartarus.org/


[16/16] opennlp git commit: OPENNLP-622 Added Morfologik to the root pom.xml / Changed artifact id to opennlp-morfologik-addon.

Posted by co...@apache.org.
OPENNLP-622 Added Morfologik to the root pom.xml / Changed artifact id to opennlp-morfologik-addon.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/49f8e25a
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/49f8e25a
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/49f8e25a

Branch: refs/heads/trunk
Commit: 49f8e25a1443b7338f8161a2e9c8e333d7a43d2b
Parents: 9b44804
Author: William Colen <wi...@gmail.com>
Authored: Wed Nov 9 19:10:26 2016 -0200
Committer: William Colen <wi...@gmail.com>
Committed: Wed Nov 9 19:10:26 2016 -0200

----------------------------------------------------------------------
 opennlp-morfologik-addon/pom.xml | 4 ++--
 pom.xml                          | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/49f8e25a/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index 56d0e47..71d2c53 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -3,10 +3,10 @@
 	<modelVersion>4.0.0</modelVersion>
 
 	<groupId>org.apache.opennlp</groupId>
-	<artifactId>morfologik-addon</artifactId>
+	<artifactId>opennlp-morfologik-addon</artifactId>
 	<version>1.0-SNAPSHOT</version>
 	<packaging>jar</packaging>
-	<name>Morfologik Addon</name>
+	<name>Apache OpenNLP Morfologik Addon</name>
 
 	<url>http://maven.apache.org</url>
 	<build>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/49f8e25a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 34e53a1..91e5043 100644
--- a/pom.xml
+++ b/pom.xml
@@ -218,6 +218,7 @@
 		<module>opennlp-tools</module>
 		<module>opennlp-uima</module>
 		<module>opennlp-brat-annotator</module>
+		<module>opennlp-morfologik-addon</module>
 		<module>opennlp-docs</module>
 		<module>opennlp-distr</module>
 	</modules>


[03/16] opennlp git commit: OPENNLP-622 Updated to OpenNLP 1.6.0 and Morfologik 2.1.0

Posted by co...@apache.org.
OPENNLP-622 Updated to OpenNLP 1.6.0 and Morfologik 2.1.0


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/15c3fb72
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/15c3fb72
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/15c3fb72

Branch: refs/heads/trunk
Commit: 15c3fb720fcde96328e5c20e6a8994b7d4f7abc8
Parents: 78dd579
Author: William Colen <co...@apache.org>
Authored: Wed Jul 6 21:22:38 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Wed Jul 6 21:22:38 2016 +0000

----------------------------------------------------------------------
 pom.xml                                         |   6 +-
 .../builder/MorfologikDictionayBuilder.java     |  52 ++++----
 .../java/opennlp/morfologik/cmdline/CLI.java    | 128 +++++++++----------
 .../MorfologikDictionaryBuilderParams.java      |  13 +-
 .../MorfologikDictionaryBuilderTool.java        |  12 +-
 .../tagdict/MorfologikPOSTaggerFactory.java     |   8 +-
 .../opennlp/morfologik/util/MorfologikUtil.java |  36 ++++++
 .../builder/POSDictionayBuilderTest.java        |  42 +++---
 .../lemmatizer/MorfologikLemmatizerTest.java    |   4 +-
 .../tagdict/MorfologikTagDictionaryTest.java    |   4 +-
 10 files changed, 158 insertions(+), 147 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 51854f6..60f201e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -30,20 +30,20 @@
    <dependency>
       <groupId>org.carrot2</groupId>
       <artifactId>morfologik-stemming</artifactId>
-      <version>1.6.0</version>
+      <version>2.1.0</version>
       <scope>compile</scope>
     </dependency>
    <dependency>
       <groupId>org.carrot2</groupId>
       <artifactId>morfologik-tools</artifactId>
-      <version>1.6.0</version>
+      <version>2.1.0</version>
       <scope>compile</scope>
     </dependency>
 
     <dependency>
       <groupId>org.apache.opennlp</groupId>
       <artifactId>opennlp-tools</artifactId>
-      <version>1.6.0-SNAPSHOT</version>
+      <version>1.6.0</version>
     </dependency>
 
 	<dependency>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
index b8bcfbf..0131318 100644
--- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
+++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
@@ -23,12 +23,14 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.nio.charset.Charset;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Properties;
 
-import morfologik.stemming.Dictionary;
-import morfologik.tools.FSABuildTool;
+import morfologik.stemming.DictionaryMetadata;
+import morfologik.stemming.EncoderType;
+import morfologik.tools.FSACompile;
 import morfologik.tools.Launcher;
 
 /**
@@ -50,20 +52,20 @@ public class MorfologikDictionayBuilder {
    * @param separator
    *          a field separator, the default is '+'. If your tags contains '+'
    *          change to something else
-   * @param isUsePrefixes
-   *          if to compact using prefixes
+   * @param encoderType
+   *          the Morfologik encoder type
    * @param isUseInfixes
    *          if to compact using infixes
    * @throws Exception
    */
   public void build(File dictInFile, File dictOutFile, Charset encoding,
-      String separator, boolean isUsePrefixes, boolean isUseInfixes)
+      String separator, EncoderType encoderType)
       throws Exception {
-
-    File propertiesFile = new File(
-        Dictionary.getExpectedFeaturesName(dictOutFile.getAbsolutePath()));
-    this.build(dictInFile, dictOutFile, propertiesFile, encoding, separator,
-        isUsePrefixes, isUseInfixes);
+    Path propertiesPath = DictionaryMetadata
+        .getExpectedMetadataLocation(dictOutFile.toPath()); 
+    
+    this.build(dictInFile, dictOutFile, propertiesPath.toFile(), encoding, separator,
+        encoderType);
   }
 
   /**
@@ -87,33 +89,29 @@ public class MorfologikDictionayBuilder {
    * @throws Exception
    */
   public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
-      Charset encoding, String separator, boolean isUsePrefixes,
-      boolean isUseInfixes) throws Exception {
+      Charset encoding, String separator, EncoderType encoderType) throws Exception {
 
     // we need to execute tab2morph followed by fsa_build
 
-    File morph = tab2morph(dictInFile, separator, isUsePrefixes, isUseInfixes);
+    File morph = tab2morph(dictInFile, separator, encoderType);
 
     fsaBuild(morph, dictOutFile);
 
     morph.delete();
 
     // now we create the properties files using the passed parameters
-    createProperties(encoding, separator, isUsePrefixes, isUseInfixes,
+    createProperties(encoding, separator, encoderType,
         propertiesOutFile);
   }
 
   void createProperties(Charset encoding, String separator,
-      boolean isUsePrefixes, boolean isUseInfixes, File propertiesFile)
+		  EncoderType encoderType, File propertiesFile)
       throws FileNotFoundException, IOException {
 
     Properties properties = new Properties();
     properties.setProperty("fsa.dict.separator", separator);
     properties.setProperty("fsa.dict.encoding", encoding.name());
-    properties.setProperty("fsa.dict.uses-prefixes",
-        Boolean.toString(isUsePrefixes));
-    properties.setProperty("fsa.dict.uses-infixes",
-        Boolean.toString(isUseInfixes));
+    properties.setProperty("fsa.dict.encoder", encoderType.name());
 
     OutputStream os = new FileOutputStream(propertiesFile);
     properties.store(os, "Morfologik POS Dictionary properties");
@@ -124,11 +122,12 @@ public class MorfologikDictionayBuilder {
   private void fsaBuild(File morph, File dictOutFile) throws Exception {
     String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
         dictOutFile.getAbsolutePath() };
-    FSABuildTool.main(params);
+    FSACompile.main(params);
+    // FSABuildTool.main(params);
   }
 
   private File tab2morph(File dictInFile, String separator,
-      boolean isUsePrefixes, boolean isUseInfixes) throws Exception {
+      EncoderType encoderType) throws Exception {
 
     // create tab2morph parameters
     List<String> tag2morphParams = new ArrayList<String>();
@@ -136,14 +135,9 @@ public class MorfologikDictionayBuilder {
 
     tag2morphParams.add("--annotation");
     tag2morphParams.add(separator);
-
-    if (isUsePrefixes) {
-      tag2morphParams.add("-pre");
-    }
-
-    if (isUseInfixes) {
-      tag2morphParams.add("-inf");
-    }
+    
+    tag2morphParams.add("--e");
+    tag2morphParams.add(encoderType.name());
 
     tag2morphParams.add("-i");
     tag2morphParams.add(dictInFile.getAbsolutePath());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/CLI.java b/src/main/java/opennlp/morfologik/cmdline/CLI.java
index 66a5151..f92d178 100644
--- a/src/main/java/opennlp/morfologik/cmdline/CLI.java
+++ b/src/main/java/opennlp/morfologik/cmdline/CLI.java
@@ -94,71 +94,71 @@ public final class CLI {
 				.println("Example: opennlp-morfologik-addon POSDictionaryBuilder help");
 	}
 
-  public static void main(String[] args) {
 
-		if (args.length == 0) {
-			usage();
-			System.exit(0);
-		}
-
-		String toolArguments[] = new String[args.length - 1];
-		System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
-
-		String toolName = args[0];
-
-		// check for format
-		String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
-		int idx = toolName.indexOf(".");
-		if (-1 < idx) {
-			formatName = toolName.substring(idx + 1);
-			toolName = toolName.substring(0, idx);
-		}
-		CmdLineTool tool = toolLookupMap.get(toolName);
-
-		try {
-			if (null == tool) {
-				throw new TerminateToolException(1, "Tool " + toolName
-						+ " is not found.");
-			}
-
-			if ((0 == toolArguments.length && tool.hasParams())
-					|| 0 < toolArguments.length
-					&& "help".equals(toolArguments[0])) {
-				if (tool instanceof TypedCmdLineTool) {
-					System.out.println(((TypedCmdLineTool) tool)
-							.getHelp(formatName));
-				} else if (tool instanceof BasicCmdLineTool) {
-					System.out.println(tool.getHelp());
-				}
-
-				System.exit(0);
-			}
-
-			if (tool instanceof TypedCmdLineTool) {
-				((TypedCmdLineTool) tool).run(formatName, toolArguments);
-			} else if (tool instanceof BasicCmdLineTool) {
-				if (-1 == idx) {
-					((BasicCmdLineTool) tool).run(toolArguments);
-				} else {
-					throw new TerminateToolException(1, "Tool " + toolName
-							+ " does not support formats.");
-				}
-			} else {
-				throw new TerminateToolException(1, "Tool " + toolName
-						+ " is not supported.");
-			}
-		} catch (TerminateToolException e) {
-
-			if (e.getMessage() != null) {
-				System.err.println(e.getMessage());
-			}
+	  @SuppressWarnings("rawtypes")
+    public static void main(String[] args) {
+
+	    if (args.length == 0) {
+	      usage();
+	      System.exit(0);
+	    }
+
+	    String toolArguments[] = new String[args.length -1];
+	    System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
+
+	    String toolName = args[0];
+
+	    //check for format
+	    String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
+	    int idx = toolName.indexOf(".");
+	    if (-1 < idx) {
+	      formatName = toolName.substring(idx + 1);
+	      toolName = toolName.substring(0, idx);
+	    }
+	    CmdLineTool tool = toolLookupMap.get(toolName);
+
+	    try {
+	      if (null == tool) {
+	        throw new TerminateToolException(1, "Tool " + toolName + " is not found.");
+	      }
+
+	      if ((0 == toolArguments.length && tool.hasParams()) ||
+	          0 < toolArguments.length && "help".equals(toolArguments[0])) {
+	          if (tool instanceof TypedCmdLineTool) {
+	            System.out.println(((TypedCmdLineTool) tool).getHelp(formatName));
+	          } else if (tool instanceof BasicCmdLineTool) {
+	            System.out.println(tool.getHelp());
+	          }
+
+	          System.exit(0);
+	      }
+
+	      if (tool instanceof TypedCmdLineTool) {
+	        ((TypedCmdLineTool) tool).run(formatName, toolArguments);
+	      } else if (tool instanceof BasicCmdLineTool) {
+	        if (-1 == idx) {
+	          ((BasicCmdLineTool) tool).run(toolArguments);
+	        } else {
+	          throw new TerminateToolException(1, "Tool " + toolName + " does not support formats.");
+	        }
+	      } else {
+	        throw new TerminateToolException(1, "Tool " + toolName + " is not supported.");
+	      }
+	    }
+	    catch (TerminateToolException e) {
+
+	      if (e.getMessage() != null) {
+	        System.err.println(e.getMessage());
+	      }
+
+	      if (e.getCause() != null) {
+	        System.err.println(e.getCause().getMessage());
+	        e.getCause().printStackTrace(System.err);
+	      }
+
+	      System.exit(e.getCode());
+	    }
+	  }
 
-			if (e.getCause() != null) {
-				System.err.println(e.getCause().getMessage());
-				e.getCause().printStackTrace(System.err);
-			}
 
-			System.exit(e.getCode());
-		}
-	}
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
index 0b1e896..193599b 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
@@ -19,6 +19,7 @@ package opennlp.morfologik.cmdline.builder;
 
 import java.io.File;
 
+import morfologik.stemming.EncoderType;
 import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
 import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.params.EncodingParameter;
@@ -37,13 +38,9 @@ interface MorfologikDictionaryBuilderParams extends EncodingParameter {
   @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
   @OptionalParameter(defaultValue = "+")
   String getFSADictSeparator();
-
-  @ParameterDescription(valueName = "true|false", description = "Compact using prefixes.")
-  @OptionalParameter(defaultValue = "true")
-  Boolean getUsesPrefixes();
-
-  @ParameterDescription(valueName = "true|false", description = "Compact using infixes.")
-  @OptionalParameter(defaultValue = "true")
-  Boolean getUsesInfixes();
+  
+  @ParameterDescription(valueName = "suffix|infix|prefix|none", description = "The type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none]. Details are in Daciuk's paper and in the code. ")
+  @OptionalParameter(defaultValue = "prefix")
+  EncoderType getEncoderType();
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
index 9da7e7d..741515e 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
@@ -17,10 +17,11 @@
 
 package opennlp.morfologik.cmdline.builder;
 
+import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
+
 import java.io.File;
 import java.nio.charset.Charset;
 
-import morfologik.stemming.Dictionary;
 import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.tools.cmdline.BasicCmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -54,18 +55,11 @@ public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
     try {
       builder.build(dictInFile, dictOutFile, propertiesFile, encoding,
-          params.getFSADictSeparator(), params.getUsesPrefixes(),
-          params.getUsesInfixes());
+          params.getFSADictSeparator(), params.getEncoderType());
     } catch (Exception e) {
       throw new TerminateToolException(-1,
           "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e);
     }
 
   }
-
-  private File getExpectedPropertiesFile(File dictFile) {
-    return new File(Dictionary.getExpectedFeaturesName(dictFile
-        .getAbsolutePath()));
-  }
-
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index 9b74ae5..f022a86 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -17,6 +17,8 @@
 
 package opennlp.morfologik.tagdict;
 
+import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
+
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileNotFoundException;
@@ -72,8 +74,8 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
     // now we try to load it...
     try {
       this.dictData = Files.readAllBytes(Paths.get(path));
-      this.dictInfo = Files.readAllBytes(Paths
-          .get(morfologik.stemming.Dictionary.getExpectedFeaturesName(path)));
+      this.dictInfo = Files.readAllBytes(getExpectedPropertiesFile(path)
+          .toPath());
 
       this.dict = createMorfologikDictionary(dictData, dictInfo);
 
@@ -163,7 +165,7 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
   private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
       throws IOException {
     morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary
-        .readAndClose(new ByteArrayInputStream(data), new ByteArrayInputStream(
+        .read(new ByteArrayInputStream(data), new ByteArrayInputStream(
             info));
     return new MorfologikTagDictionary(dict);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/util/MorfologikUtil.java b/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
new file mode 100644
index 0000000..bd4d1a4
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/util/MorfologikUtil.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.util;
+
+import java.io.File;
+
+import morfologik.stemming.DictionaryMetadata;
+
+/** Static helpers that locate the metadata file belonging to a dictionary. */
+public class MorfologikUtil {
+  /** Delegates to DictionaryMetadata.getExpectedMetadataLocation for dictFile. */
+  public static File getExpectedPropertiesFile(File dictFile) {
+    return DictionaryMetadata
+        .getExpectedMetadataLocation(dictFile.toPath()).toFile();
+  }
+
+  /** Wraps the path string in a File and defers to the File overload. */
+  public static File getExpectedPropertiesFile(String dictFile) {
+    return getExpectedPropertiesFile(new File(dictFile));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
index 16d1dac..730025c 100644
--- a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
@@ -25,6 +25,7 @@ import java.nio.charset.Charset;
 import java.util.Properties;
 
 import junit.framework.TestCase;
+import morfologik.stemming.EncoderType;
 import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
 
 import org.junit.Test;
@@ -40,8 +41,7 @@ public class POSDictionayBuilderTest extends TestCase {
     File dictOutFile = File.createTempFile(
         POSDictionayBuilderTest.class.getName(), ".dict");
 
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
-        true);
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
 
     MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
         .toURL());
@@ -54,40 +54,28 @@ public class POSDictionayBuilderTest extends TestCase {
 
     Charset c = Charset.forName("iso-8859-1");
     String sep = "_";
-    boolean pref = true;
-    boolean inf = true;
-    Properties p = createPropertiesHelper(c, sep, pref, inf);
+    
+    EncoderType encoderType = EncoderType.PREFIX;
+    Properties p = createPropertiesHelper(c, sep, encoderType);
 
     assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
     assertEquals(sep, p.getProperty("fsa.dict.separator"));
-    assertEquals(pref,
-        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
-    assertEquals(inf,
-        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
-
-    pref = false;
-    inf = true;
-    p = createPropertiesHelper(c, sep, pref, inf);
-    assertEquals(pref,
-        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
-    assertEquals(inf,
-        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
-
-    pref = true;
-    inf = false;
-    p = createPropertiesHelper(c, sep, pref, inf);
-    assertEquals(pref,
-        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
-    assertEquals(inf,
-        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+    assertEquals(encoderType,
+        EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
+    
+    encoderType = EncoderType.SUFFIX;
+    p = createPropertiesHelper(c, sep, encoderType);
+    assertEquals(encoderType,
+        EncoderType.valueOf(p.getProperty("fsa.dict.encoder")));
+
   }
 
   private Properties createPropertiesHelper(Charset c, String sep,
-      boolean pref, boolean inf) throws IOException {
+      EncoderType encoderType) throws IOException {
     MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
     File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
         ".info");
-    builder.createProperties(c, sep, pref, inf, f);
+    builder.createProperties(c, sep, encoderType, f);
 
     InputStream is = new FileInputStream(f);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
index 6fd6ec1..87fc2cc 100644
--- a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
@@ -5,6 +5,7 @@ import static org.junit.Assert.assertEquals;
 import java.io.File;
 import java.nio.charset.Charset;
 
+import morfologik.stemming.EncoderType;
 import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.morfologik.builder.POSDictionayBuilderTest;
 import opennlp.tools.lemmatizer.DictionaryLemmatizer;
@@ -34,8 +35,7 @@ public class MorfologikLemmatizerTest {
     File dictOutFile = File.createTempFile(
         POSDictionayBuilderTest.class.getName(), ".dict");
 
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
-        true);
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
 
     MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
         .toURL());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15c3fb72/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
index def97b6..d605e15 100644
--- a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
@@ -9,6 +9,7 @@ import java.util.Arrays;
 import java.util.List;
 
 import morfologik.stemming.Dictionary;
+import morfologik.stemming.EncoderType;
 import opennlp.morfologik.builder.MorfologikDictionayBuilder;
 import opennlp.morfologik.builder.POSDictionayBuilderTest;
 import opennlp.morfologik.tagdict.MorfologikTagDictionary;
@@ -80,8 +81,7 @@ public class MorfologikTagDictionaryTest {
     File dictOutFile = File.createTempFile(
         POSDictionayBuilderTest.class.getName(), ".dict");
 
-    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
-        true);
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX);
 
     MorfologikTagDictionary ml = new MorfologikTagDictionary(
         Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);


[09/16] opennlp git commit: OPENNLP-622 Included transitive dependencies

Posted by co...@apache.org.
OPENNLP-622 Included transitive dependencies


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/be7e6bab
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/be7e6bab
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/be7e6bab

Branch: refs/heads/trunk
Commit: be7e6bab698d2a6fab35e254cf39970584208361
Parents: 6ada5de
Author: William Colen <co...@apache.org>
Authored: Thu Jul 14 16:27:40 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 14 16:27:40 2016 +0000

----------------------------------------------------------------------
 src/main/assembly/bin.xml | 12 ++++++++++++
 1 file changed, 12 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/be7e6bab/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/bin.xml b/src/main/assembly/bin.xml
index bbc1607..ab4f6da 100644
--- a/src/main/assembly/bin.xml
+++ b/src/main/assembly/bin.xml
@@ -24,11 +24,23 @@
   <formats>
     <format>tar.gz</format>
     <format>zip</format>
+    <format>dir</format>
   </formats>
   
     <includeBaseDirectory>true</includeBaseDirectory>
 	<baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
   
+	<dependencySets>
+		<dependencySet>
+			<scope>runtime</scope>
+			<unpack>false</unpack>
+			<useProjectArtifact>false</useProjectArtifact>
+			<fileMode>644</fileMode>
+			<directoryMode>755</directoryMode>
+			<outputDirectory>lib</outputDirectory>
+			<useTransitiveDependencies>true</useTransitiveDependencies>
+		</dependencySet>
+	</dependencySets>
 	
 	<fileSets>
 	    <fileSet>


[12/16] opennlp git commit: OPENNLP-622 Preparing to migrate morfologik-addon to main repository

Posted by co...@apache.org.
http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
deleted file mode 100644
index 0a7ba48..0000000
--- a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.builder;
-
-import java.io.File;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-
-import junit.framework.TestCase;
-import morfologik.stemming.DictionaryMetadata;
-import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
-
-import org.junit.Test;
-
-public class POSDictionayBuilderTest extends TestCase {
-
-  @Test
-  public void testBuildDictionary() throws Exception {
-    
-    Path output = createMorfologikDictionary();
-
-    MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
-
-    assertNotNull(ml);
-  }
-  
-  public static Path createMorfologikDictionary() throws Exception {
-    Path tabFilePath = File.createTempFile(
-        POSDictionayBuilderTest.class.getName(), ".txt").toPath();
-    Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath);
-    
-    Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
-        "/dictionaryWithLemma.txt"), tabFilePath, StandardCopyOption.REPLACE_EXISTING);
-    Files.copy(POSDictionayBuilderTest.class.getResourceAsStream(
-        "/dictionaryWithLemma.info"), infoFilePath, StandardCopyOption.REPLACE_EXISTING);
-    
-    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-    
-    return builder.build(tabFilePath);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
deleted file mode 100644
index 6b7525e..0000000
--- a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package opennlp.morfologik.lemmatizer;
-
-import static org.junit.Assert.assertEquals;
-
-import java.nio.file.Path;
-
-import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.tools.lemmatizer.DictionaryLemmatizer;
-
-import org.junit.Test;
-
-public class MorfologikLemmatizerTest {
-
-  @Test
-  public void testLemmatizeInsensitive() throws Exception {
-    DictionaryLemmatizer dict = createDictionary(false);
-
-    assertEquals("casar", dict.lemmatize("casa", "V"));
-    assertEquals("casa", dict.lemmatize("casa", "NOUN"));
-
-    assertEquals("casa", dict.lemmatize("Casa", "PROP"));
-
-  }
-
-  private MorfologikLemmatizer createDictionary(boolean caseSensitive)
-      throws Exception {
-
-    Path output = POSDictionayBuilderTest.createMorfologikDictionary();
-
-    MorfologikLemmatizer ml = new MorfologikLemmatizer(output);
-
-    return ml;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
deleted file mode 100644
index c6c9e04..0000000
--- a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
+++ /dev/null
@@ -1,78 +0,0 @@
-package opennlp.morfologik.tagdict;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.util.Arrays;
-import java.util.List;
-
-import morfologik.stemming.Dictionary;
-import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.tools.postag.TagDictionary;
-
-import org.junit.Test;
-
-public class MorfologikTagDictionaryTest {
-
-  @Test
-  public void testNoLemma() throws Exception {
-    MorfologikTagDictionary dict = createDictionary(false);
-
-    List<String> tags = Arrays.asList(dict.getTags("carro"));
-    assertEquals(1, tags.size());
-    assertTrue(tags.contains("NOUN"));
-
-  }
-
-  @Test
-  public void testPOSDictionaryInsensitive() throws Exception {
-    TagDictionary dict = createDictionary(false);
-
-    List<String> tags = Arrays.asList(dict.getTags("casa"));
-    assertEquals(2, tags.size());
-    assertTrue(tags.contains("NOUN"));
-    assertTrue(tags.contains("V"));
-
-    // this is the behavior of case insensitive dictionary
-    // if we search it using case insensitive, Casa as a proper noun
-    // should be lower case in the dictionary
-    tags = Arrays.asList(dict.getTags("Casa"));
-    assertEquals(2, tags.size());
-    assertTrue(tags.contains("NOUN"));
-    assertTrue(tags.contains("V"));
-
-  }
-
-  @Test
-  public void testPOSDictionarySensitive() throws Exception {
-    TagDictionary dict = createDictionary(true);
-
-    List<String> tags = Arrays.asList(dict.getTags("casa"));
-    assertEquals(2, tags.size());
-    assertTrue(tags.contains("NOUN"));
-    assertTrue(tags.contains("V"));
-
-    // this is the behavior of case insensitive dictionary
-    // if we search it using case insensitive, Casa as a proper noun
-    // should be lower case in the dictionary
-    tags = Arrays.asList(dict.getTags("Casa"));
-    assertEquals(1, tags.size());
-    assertTrue(tags.contains("PROP"));
-
-  }
-
-  private MorfologikTagDictionary createDictionary(boolean caseSensitive)
-      throws Exception {
-    return this.createDictionary(caseSensitive, null);
-  }
-
-  private MorfologikTagDictionary createDictionary(boolean caseSensitive,
-      List<String> constant) throws Exception {
-
-    Dictionary dic = Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary());
-    MorfologikTagDictionary ml = new MorfologikTagDictionary(dic, caseSensitive);
-
-    return ml;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
deleted file mode 100644
index 7341a02..0000000
--- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.morfologik.tagdict;
-
-import static org.junit.Assert.*;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.file.Path;
-
-import opennlp.morfologik.builder.POSDictionayBuilderTest;
-import opennlp.tools.postag.POSModel;
-import opennlp.tools.postag.POSSample;
-import opennlp.tools.postag.POSTaggerFactory;
-import opennlp.tools.postag.POSTaggerME;
-import opennlp.tools.postag.TagDictionary;
-import opennlp.tools.postag.WordTagSampleStream;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.util.model.ModelType;
-
-import org.junit.Test;
-
-/**
- * Tests for the {@link POSTaggerFactory} class.
- */
-public class POSTaggerFactoryTest {
-
-  private static ObjectStream<POSSample> createSampleStream()
-      throws IOException {
-    InputStream in = POSTaggerFactoryTest.class.getClassLoader()
-        .getResourceAsStream("AnnotatedSentences.txt");
-
-    return new WordTagSampleStream((new InputStreamReader(in)));
-  }
-
-  static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
-      throws IOException {
-    return POSTaggerME.train("en", createSampleStream(),
-        TrainingParameters.defaultParams(), factory);
-  }
-
-  @Test
-  public void testPOSTaggerWithCustomFactory() throws Exception {
-
-    Path dictionary = POSDictionayBuilderTest.createMorfologikDictionary();
-    POSTaggerFactory inFactory = new MorfologikPOSTaggerFactory();
-    TagDictionary inDict = inFactory.createTagDictionary(dictionary.toFile());
-    inFactory.setTagDictionary(inDict);
-
-    POSModel posModel = trainPOSModel(ModelType.MAXENT, inFactory);
-
-    POSTaggerFactory factory = posModel.getFactory();
-    assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
-
-    factory = null;
-    
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    posModel.serialize(out);
-    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
-
-    POSModel fromSerialized = new POSModel(in);
-
-    factory = fromSerialized.getFactory();
-    assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
-    
-    assertEquals(2, factory.getTagDictionary().getTags("casa").length);
-  }
-
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/resources/AnnotatedSentences.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/AnnotatedSentences.txt b/src/test/resources/AnnotatedSentences.txt
deleted file mode 100644
index b40be87..0000000
--- a/src/test/resources/AnnotatedSentences.txt
+++ /dev/null
@@ -1,136 +0,0 @@
-Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._.
-I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._.
-So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._.
-She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._.
-I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._.
-
-Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._.
-Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._.
-I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._.
-As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._.
-The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._.
-But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._.
-It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._.
-She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstra\ufffde_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._.
-
-Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_.
-About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._.
-That_DT sounds_VBZ good_JJ ._.
-So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstra\ufffde_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._.
-I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstra\ufffde_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._.
-Thank_VB you_PRP very_RB much_RB !_.
-
-Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._.
-I_PRP 'm_VBP in_IN N\ufffdrnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._.
-Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstra\ufffde_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Caf\ufffd_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._.
-Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_.
-
-My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._.
-We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._.
-Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._.
-As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._.
-But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._.
-Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstra\ufffde_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._.
-
-I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._.
-I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._.
-I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._.
-I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._.
-
-Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._.
-As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._.
-The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._.
-Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._.
-
-An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._.
-Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._.
-She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._.
-But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._.
-So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT caf\ufffd_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._.
-It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._.
-She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._.
-
-Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_.
-They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._.
-Are_VBP they_PRP made_VBN of_IN leather_NN ?_.
-No,_NNP that_DT 's_VBZ faked_VBN ._.
-But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._.
-I_PRP got_VBD them_PRP from_IN Hamburg._NNP
-Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstra\ufffde_NNP ?_.
-It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._.
-I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._.
-Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_.
-Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstra\ufffde_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._.
-I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._.
-His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._.
-
-Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_.
-My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_.
-How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_.
-His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._.
-I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN
-I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP
-Is_VBZ that_DT right_NN ?_.
-Yes_UH ,_, it_PRP definitely_RB is_VBZ ._.
-So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Stra\ufffde_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._.
-Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._.
-Bye_NNP !_.
-
-On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._.
-The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._.
-Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._.
-But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._.
-Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstra\ufffde_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._.
-
-Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._.
-I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._.
-I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._.
-An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN G\ufffdrtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._.
-Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN G\ufffdrtnerweg_NNP 25_CD ._.
-The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstra\ufffde_NNP 15_CD ._.
-He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._.
-
-Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._.
-He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._.
-Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._.
-Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._.
-It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._.
-
-When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._.
-He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._.
-One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._.
-So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._.
-She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstra\ufffde_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._.
-Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._.
-
-On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._.
-The_DT driver_NN got_VBD badly_RB injured_VBN ._.
-Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._.
-A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._.
-He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._.
-He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-L\ufffdw-Stra\ufffde_NNP 13_CD in_IN 84630_CD Gauting_NNP ._.
-The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._.
-Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._.
-
-Hi_NNP ,_, how_WRB are_VBP you_PRP ?_.
-Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_.
-Yeah_UH for_IN sure_JJ ._.
-How_WRB did_VBD you_PRP know_VB that_DT ?_.
-I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._.
-Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._.
-Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_.
-Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._.
-But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._.
-I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstra\ufffde_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._.
-The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._.
-I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._.
-Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_.
-
-My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._.
-When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._.
-My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstra\ufffde_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._.
-But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._.
-So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._.
-Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._.
-We_PRP live_VBP in_IN B\ufffdrgerstra\ufffde_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._.
-I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/resources/dictionaryWithLemma.info
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.info b/src/test/resources/dictionaryWithLemma.info
deleted file mode 100644
index ad5fe8d..0000000
--- a/src/test/resources/dictionaryWithLemma.info
+++ /dev/null
@@ -1,15 +0,0 @@
-#
-# REQUIRED PROPERTIES
-#
-
-# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding.
-fsa.dict.separator=,
-
-# The charset in which the input is encoded. UTF-8 is strongly recommended.
-fsa.dict.encoding=UTF-8
-
-# The type of lemma-inflected form encoding compression that precedes automaton
-# construction. Allowed values: [suffix, infix, prefix, none].
-# Details are in Daciuk's paper and in the code. 
-# Leave at 'prefix' if not sure.
-fsa.dict.encoder=prefix
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/772f31ff/src/test/resources/dictionaryWithLemma.txt
----------------------------------------------------------------------
diff --git a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt
deleted file mode 100644
index 09d39e3..0000000
--- a/src/test/resources/dictionaryWithLemma.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-casa,casa,NOUN
-casar,casa,V
-casar,casar,V-INF
-Casa,Casa,PROP
-casa,casinha,NOUN
-casa,casona,NOUN
-menino,menina,NOUN
-menino,menino,NOUN
-menino,meninão,NOUN
-menino,menininho,NOUN
-carro,carro,NOUN
\ No newline at end of file


[08/16] opennlp git commit: OPENNLP-622 Fixed CLI launcher

Posted by co...@apache.org.
OPENNLP-622 Fixed CLI launcher


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6ada5de2
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6ada5de2
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6ada5de2

Branch: refs/heads/trunk
Commit: 6ada5de24aa39ce90733a477d8a947d2b3b60568
Parents: d1fab8c
Author: William Colen <co...@apache.org>
Authored: Thu Jul 14 16:26:43 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 14 16:26:43 2016 +0000

----------------------------------------------------------------------
 src/main/bin/morfologik-addon     | 2 +-
 src/main/bin/morfologik-addon.bat | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/6ada5de2/src/main/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon b/src/main/bin/morfologik-addon
index 70fb1d7..9b0faf9 100755
--- a/src/main/bin/morfologik-addon
+++ b/src/main/bin/morfologik-addon
@@ -32,4 +32,4 @@ fi
 # Might fail if $0 is a link
 OPENNLP_HOME=`dirname "$0"`/..
 
-$JAVACMD -Xmx1024m -jar $OPENNLP_HOME/lib/apache-opennlp-morfologik-addon-*.jar $@
+$JAVACMD -Xmx1024m -cp "lib/*" opennlp.morfologik.cmdline.CLI $@

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6ada5de2/src/main/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon.bat b/src/main/bin/morfologik-addon.bat
index a69fbd6..aeec31f 100644
--- a/src/main/bin/morfologik-addon.bat
+++ b/src/main/bin/morfologik-addon.bat
@@ -40,7 +40,7 @@ IF "%OPENNLP_HOME%" == "" (
 )
 
 REM #  Get the library JAR file name (JIRA OPENNLP-554)
-FOR %%A IN ("%OPENNLP_HOME%\lib\apache-opennlp-morfologik-addon-*.jar") DO SET JAR_FILE=%%A
+FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO SET JAR_FILE=%%A
 
 %JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %*
 


[06/16] opennlp git commit: OPENNLP-622 Added distribution assembly files

Posted by co...@apache.org.
OPENNLP-622 Added distribution assembly files


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f588858a
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f588858a
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f588858a

Branch: refs/heads/trunk
Commit: f588858a45c8992330beb171f8da079a0820961b
Parents: 3ceb554
Author: William Colen <co...@apache.org>
Authored: Fri Jul 8 03:53:06 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Fri Jul 8 03:53:06 2016 +0000

----------------------------------------------------------------------
 bin/morfologik-addon              |  20 +++
 bin/morfologik-addon.bat          |  21 +++
 pom.xml                           | 150 ++++++++++++++-------
 src/main/assembly/bin.xml         |  79 +++++++++++
 src/main/assembly/src.xml         |  39 ++++++
 src/main/bin/morfologik-addon     |  35 +++++
 src/main/bin/morfologik-addon.bat |  47 +++++++
 src/main/readme/LICENSE           | 230 +++++++++++++++++++++++++++++++++
 src/main/readme/NOTICE            |  11 ++
 9 files changed, 583 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/bin/morfologik-addon b/bin/morfologik-addon
new file mode 100755
index 0000000..ccc635e
--- /dev/null
+++ b/bin/morfologik-addon
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+#   Licensed to the Apache Software Foundation (ASF) under one
+#   or more contributor license agreements.  See the NOTICE file
+#   distributed with this work for additional information
+#   regarding copyright ownership.  The ASF licenses this file
+#   to you under the Apache License, Version 2.0 (the
+#   "License"); you may not use this file except in compliance
+#   with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing,
+#   software distributed under the License is distributed on an
+#   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied.  See the License for the
+#   specific language governing permissions and limitations
+#   under the License.
+
+mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=$*"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/bin/morfologik-addon.bat b/bin/morfologik-addon.bat
new file mode 100644
index 0000000..26a4778
--- /dev/null
+++ b/bin/morfologik-addon.bat
@@ -0,0 +1,21 @@
+@ECHO OFF
+
+REM #   Licensed to the Apache Software Foundation (ASF) under one
+REM #   or more contributor license agreements.  See the NOTICE file
+REM #   distributed with this work for additional information
+REM #   regarding copyright ownership.  The ASF licenses this file
+REM #   to you under the Apache License, Version 2.0 (the
+REM #   "License"); you may not use this file except in compliance
+REM #   with the License.  You may obtain a copy of the License at
+REM #
+REM #    http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM #   Unless required by applicable law or agreed to in writing,
+REM #   software distributed under the License is distributed on an
+REM #   
+REM #  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM #   KIND, either express or implied.  See the License for the
+REM #   specific language governing permissions and limitations
+REM #   under the License.
+
+mvn -e -q exec:java "-Dexec.mainClass=opennlp.morfologik.cmdline.CLI" "-Dexec.args=%*"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 60f201e..56d0e47 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,57 +1,109 @@
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
 
-  <groupId>org.apache.opennlp</groupId>
-  <artifactId>morfologik-addon</artifactId>
-  <version>1.0-SNAPSHOT</version>
-  <packaging>jar</packaging>
-  <name>Morfologik Addon</name>
+	<groupId>org.apache.opennlp</groupId>
+	<artifactId>morfologik-addon</artifactId>
+	<version>1.0-SNAPSHOT</version>
+	<packaging>jar</packaging>
+	<name>Morfologik Addon</name>
 
-  <url>http://maven.apache.org</url>
-    <build>
-        <plugins>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-compiler-plugin</artifactId>
-                <version>2.3.2</version>
-                <configuration>
-                    <source>1.7</source>
-                    <target>1.7</target>
-                </configuration>
-            </plugin>
-        </plugins>
-    </build>
-    <properties>
-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-  </properties>
+	<url>http://maven.apache.org</url>
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<version>2.3.2</version>
+				<configuration>
+					<source>1.7</source>
+					<target>1.7</target>
+				</configuration>
+			</plugin>
+			<plugin>
+				<artifactId>maven-assembly-plugin</artifactId>
+				<executions>
+					<execution>
+						<id>bundle-project-sources</id>
+						<phase>package</phase>
+						<goals>
+							<goal>single</goal>
+						</goals>
+						<configuration>
+							<descriptors>
+								<descriptor>src/main/assembly/bin.xml</descriptor>
+								<descriptor>src/main/assembly/src.xml</descriptor>
+							</descriptors>
+							<!-- Tar package is only compatible with gnu tar,
+							     many files have names longer than 100 chars.
+							     Right now only javadoc files are too long.
+							 -->
+							 <tarLongFileMode>gnu</tarLongFileMode>
+							 
+							 <finalName>apache-opennlp-morfologik-addon-${project.version}</finalName>
+						</configuration>
+					</execution>
+				</executions>
+			</plugin>
+			<plugin> 
+	        <artifactId>maven-antrun-plugin</artifactId> 
+	        <version>1.6</version> 
+	        <executions> 
+	          <execution> 
+	            <id>generate checksums for binary artifacts</id> 
+	            <goals><goal>run</goal></goals> 
+	            <phase>verify</phase> 
+	            <configuration> 
+	              <target> 
+	                <checksum algorithm="sha1" format="MD5SUM"> 
+	                  <fileset dir="${project.build.directory}"> 
+	                    <include name="*.zip" /> 
+	                    <include name="*.gz" /> 
+	                  </fileset> 
+	                </checksum> 
+	                <checksum algorithm="md5" format="MD5SUM"> 
+	                  <fileset dir="${project.build.directory}"> 
+	                    <include name="*.zip" /> 
+	                    <include name="*.gz" /> 
+	                  </fileset> 
+	                </checksum> 
+	              </target> 
+	            </configuration> 
+	          </execution> 
+	        </executions> 
+	      </plugin>
+		</plugins>
+	</build>
+	<properties>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+	</properties>
 
-  <dependencies>
-   <dependency>
-      <groupId>org.carrot2</groupId>
-      <artifactId>morfologik-stemming</artifactId>
-      <version>2.1.0</version>
-      <scope>compile</scope>
-    </dependency>
-   <dependency>
-      <groupId>org.carrot2</groupId>
-      <artifactId>morfologik-tools</artifactId>
-      <version>2.1.0</version>
-      <scope>compile</scope>
-    </dependency>
+	<dependencies>
+		<dependency>
+			<groupId>org.carrot2</groupId>
+			<artifactId>morfologik-stemming</artifactId>
+			<version>2.1.0</version>
+			<scope>compile</scope>
+		</dependency>
+		<dependency>
+			<groupId>org.carrot2</groupId>
+			<artifactId>morfologik-tools</artifactId>
+			<version>2.1.0</version>
+			<scope>compile</scope>
+		</dependency>
 
-    <dependency>
-      <groupId>org.apache.opennlp</groupId>
-      <artifactId>opennlp-tools</artifactId>
-      <version>1.6.0</version>
-    </dependency>
+		<dependency>
+			<groupId>org.apache.opennlp</groupId>
+			<artifactId>opennlp-tools</artifactId>
+			<version>1.6.0</version>
+		</dependency>
 
-	<dependency>
-		<groupId>junit</groupId>
-		<artifactId>junit</artifactId>
-		<version>4.8.1</version>
-		<scope>test</scope>
-	</dependency>
+		<dependency>
+			<groupId>junit</groupId>
+			<artifactId>junit</artifactId>
+			<version>4.8.1</version>
+			<scope>test</scope>
+		</dependency>
 
-  </dependencies>
+	</dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/assembly/bin.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/bin.xml b/src/main/assembly/bin.xml
new file mode 100644
index 0000000..bbc1607
--- /dev/null
+++ b/src/main/assembly/bin.xml
@@ -0,0 +1,79 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<assembly>
+  <id>bin</id>
+  <formats>
+    <format>tar.gz</format>
+    <format>zip</format>
+  </formats>
+  
+    <includeBaseDirectory>true</includeBaseDirectory>
+	<baseDirectory>/apache-opennlp-morfologik-addon-${project.version}</baseDirectory>
+  
+	
+	<fileSets>
+	    <fileSet>
+	    	<directory>src/main/readme</directory>
+	    	<outputDirectory></outputDirectory>
+	    	<fileMode>644</fileMode>
+	    	<directoryMode>755</directoryMode>      
+	    </fileSet>
+		
+	    <fileSet>
+	      <directory>.</directory>
+	      <outputDirectory></outputDirectory>
+	      <filtered>true</filtered>
+	      <fileMode>644</fileMode>
+	      <directoryMode>755</directoryMode> 
+	      <includes>
+	        <include>README</include>
+	        <include>RELEASE_NOTES.html</include>
+	      </includes>       
+	    </fileSet>
+	    
+	    <fileSet>
+	      <directory>target</directory>
+	      <outputDirectory></outputDirectory>
+	      <fileMode>644</fileMode>
+	      <directoryMode>755</directoryMode> 
+	      <includes>
+	        <include>issuesFixed/**</include>      
+	      </includes>       
+	    </fileSet>
+	    
+		<fileSet>
+			<directory>src/main/bin</directory>
+			<fileMode>755</fileMode>
+			<directoryMode>755</directoryMode>
+			<outputDirectory>bin</outputDirectory>
+		</fileSet>
+		
+		  <fileSet>
+		    <directory>target</directory>
+		    <outputDirectory>lib</outputDirectory>
+		    <includes>
+		      <include>morfologik-addon-*.jar</include>
+		    </includes>
+		  </fileSet>
+		
+	</fileSets>
+</assembly>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/assembly/src.xml
----------------------------------------------------------------------
diff --git a/src/main/assembly/src.xml b/src/main/assembly/src.xml
new file mode 100644
index 0000000..cdcc9d3
--- /dev/null
+++ b/src/main/assembly/src.xml
@@ -0,0 +1,39 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<assembly>
+  <id>src</id>
+  <formats>
+    <format>tar.gz</format>
+    <format>zip</format>
+  </formats>
+  
+  <baseDirectory>/apache-opennlp-${project.version}-src</baseDirectory>
+  
+  <fileSets>
+    <fileSet>
+      <directory>../</directory>
+      <outputDirectory></outputDirectory>
+      <excludes>
+        <exclude>**/target/**</exclude>
+        <exclude>**/.*/**</exclude>
+        <exclude>**/pom.xml.releaseBackup</exclude>
+        <exclude>**/release.properties</exclude>
+      </excludes>
+    </fileSet>
+  </fileSets>
+</assembly>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/bin/morfologik-addon
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon b/src/main/bin/morfologik-addon
new file mode 100755
index 0000000..70fb1d7
--- /dev/null
+++ b/src/main/bin/morfologik-addon
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+#   Licensed to the Apache Software Foundation (ASF) under one
+#   or more contributor license agreements.  See the NOTICE file
+#   distributed with this work for additional information
+#   regarding copyright ownership.  The ASF licenses this file
+#   to you under the Apache License, Version 2.0 (the
+#   "License"); you may not use this file except in compliance
+#   with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing,
+#   software distributed under the License is distributed on an
+#   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied.  See the License for the
+#   specific language governing permissions and limitations
+#   under the License.
+
+# Note:  Do not output anything in this script file, any output
+#        may be inadvertently placed in any output files if
+#        output redirection is used.
+
+# Resolve the Java launcher: honour a caller-supplied JAVACMD, then JAVA_HOME,
+# and finally fall back to whatever "java" is found on the PATH.
+if [ -z "$JAVACMD" ] ; then
+  if [ -n "$JAVA_HOME"  ] ; then
+    JAVACMD="$JAVA_HOME/bin/java"
+  else
+    JAVACMD="`which java`"
+  fi
+fi
+
+# Might fail if $0 is a link
+OPENNLP_HOME=`dirname "$0"`/..
+
+# Everything except the version glob is quoted so that paths and arguments
+# containing whitespace survive word splitting; "$@" forwards each argument as-is.
+"$JAVACMD" -Xmx1024m -jar "$OPENNLP_HOME"/lib/apache-opennlp-morfologik-addon-*.jar "$@"

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/bin/morfologik-addon.bat
----------------------------------------------------------------------
diff --git a/src/main/bin/morfologik-addon.bat b/src/main/bin/morfologik-addon.bat
new file mode 100644
index 0000000..a69fbd6
--- /dev/null
+++ b/src/main/bin/morfologik-addon.bat
@@ -0,0 +1,47 @@
+@ECHO off
+
+REM #   Licensed to the Apache Software Foundation (ASF) under one
+REM #   or more contributor license agreements.  See the NOTICE file
+REM #   distributed with this work for additional information
+REM #   regarding copyright ownership.  The ASF licenses this file
+REM #   to you under the Apache License, Version 2.0 (the
+REM #   "License"); you may not use this file except in compliance
+REM #   with the License.  You may obtain a copy of the License at
+REM #
+REM #    http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM #   Unless required by applicable law or agreed to in writing,
+REM #   software distributed under the License is distributed on an
+REM #   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM #   KIND, either express or implied.  See the License for the
+REM #   specific language governing permissions and limitations
+REM #   under the License.
+
+REM # Note:  Do not output anything in this script file, any output
+REM #        may be inadvertently placed in any output files if
+REM #        output redirection is used.
+SETLOCAL
+
+REM # Pick the Java launcher: keep a caller-supplied JAVA_CMD, otherwise use
+REM # JAVA_HOME\bin\java when JAVA_HOME is set, else plain "java" from the PATH.
+IF "%JAVA_CMD%" == "" (
+	IF "%JAVA_HOME%" == "" (
+		SET JAVA_CMD=java 
+	) ELSE (
+		REM # Keep JAVA_HOME to short-name without spaces
+		FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java
+	)
+)
+
+REM #  Should work with Windows XP and greater.  If not, specify the path to where it is installed.
+REM #  When OPENNLP_HOME is not set it is derived from this script's own location.
+IF "%OPENNLP_HOME%" == "" (
+	SET OPENNLP_HOME=%~sp0..
+) ELSE (
+	REM # Keep OPENNLP_HOME to short-name without spaces
+	FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA
+)
+
+REM #  Get the library JAR file name (JIRA OPENNLP-554)
+REM #  If several JARs match the wildcard, JAR_FILE ends up holding the last one visited.
+FOR %%A IN ("%OPENNLP_HOME%\lib\apache-opennlp-morfologik-addon-*.jar") DO SET JAR_FILE=%%A
+
+REM #  Run the add-on JAR with a 1024 MB maximum heap, forwarding all script arguments.
+%JAVA_CMD% -Xmx1024m -jar %JAR_FILE% %*
+
+ENDLOCAL
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/readme/LICENSE
----------------------------------------------------------------------
diff --git a/src/main/readme/LICENSE b/src/main/readme/LICENSE
new file mode 100644
index 0000000..576b4cf
--- /dev/null
+++ b/src/main/readme/LICENSE
@@ -0,0 +1,230 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+The following license applies to the Snowball stemmers:
+        
+        Copyright (c) 2001, Dr Martin Porter
+        Copyright (c) 2002, Richard Boulton
+        All rights reserved.
+        
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions are met:
+        
+            * Redistributions of source code must retain the above copyright notice,
+            * this list of conditions and the following disclaimer.
+            * Redistributions in binary form must reproduce the above copyright
+            * notice, this list of conditions and the following disclaimer in the
+            * documentation and/or other materials provided with the distribution.
+            * Neither the name of the copyright holders nor the names of its contributors
+            * may be used to endorse or promote products derived from this software
+            * without specific prior written permission.
+        
+        THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+        AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+        IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+        DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+        FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+        DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+        SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+        CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+        OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+        OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f588858a/src/main/readme/NOTICE
----------------------------------------------------------------------
diff --git a/src/main/readme/NOTICE b/src/main/readme/NOTICE
new file mode 100644
index 0000000..73fb1d7
--- /dev/null
+++ b/src/main/readme/NOTICE
@@ -0,0 +1,11 @@
+Apache OpenNLP
+Copyright 2010, 2013 The Apache Software Foundation
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+The snowball stemmers in
+opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
+were developed by Martin Porter and Richard Boulton.
+The full snowball package is available from
+http://snowball.tartarus.org/


[11/16] opennlp git commit: OPENNLP-622 Added Morfologik license

Posted by co...@apache.org.
OPENNLP-622 Added Morfologik license


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/0cced84d
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/0cced84d
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/0cced84d

Branch: refs/heads/trunk
Commit: 0cced84d1e364959616235f87742e28353e81779
Parents: 60a3b24
Author: William Colen <co...@apache.org>
Authored: Thu Jul 14 22:09:05 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 14 22:09:05 2016 +0000

----------------------------------------------------------------------
 src/main/readme/MORFOLOGIK-LICENSE | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/0cced84d/src/main/readme/MORFOLOGIK-LICENSE
----------------------------------------------------------------------
diff --git a/src/main/readme/MORFOLOGIK-LICENSE b/src/main/readme/MORFOLOGIK-LICENSE
new file mode 100644
index 0000000..0554010
--- /dev/null
+++ b/src/main/readme/MORFOLOGIK-LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2006 Dawid Weiss
+Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, 
+are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, 
+    this list of conditions and the following disclaimer.
+    
+    * Redistributions in binary form must reproduce the above copyright notice, 
+    this list of conditions and the following disclaimer in the documentation 
+    and/or other materials provided with the distribution.
+    
+    * Neither the name of Morfologik nor the names of its contributors 
+    may be used to endorse or promote products derived from this software 
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file


[07/16] opennlp git commit: OPENNLP-622 Fixed issues related to command line.

Posted by co...@apache.org.
OPENNLP-622 Fixed issues related to command line.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d1fab8cd
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d1fab8cd
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d1fab8cd

Branch: refs/heads/trunk
Commit: d1fab8cd4215ddf65ce98ef6aae2bc06720be742
Parents: f588858
Author: William Colen <co...@apache.org>
Authored: Fri Jul 8 19:18:54 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Fri Jul 8 19:18:54 2016 +0000

----------------------------------------------------------------------
 .../builder/XMLDictionaryToTableParams.java     | 11 ++++-
 .../builder/XMLDictionaryToTableTool.java       | 51 ++++++++++++++++++--
 .../tagdict/MorfologikPOSTaggerFactory.java     | 26 ----------
 .../tagdict/POSTaggerFactoryTest.java           |  6 ++-
 4 files changed, 63 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
index b88cc5d..4ee8cd4 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
@@ -19,6 +19,7 @@ package opennlp.morfologik.cmdline.builder;
 
 import java.io.File;
 
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
 import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.params.EncodingParameter;
 
@@ -30,7 +31,15 @@ interface XMLDictionaryToTableParams extends EncodingParameter {
   @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
   File getInputFile();
 
-  @ParameterDescription(valueName = "out", description = "Tab separated format.")
+  @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).")
   File getOutputFile();
 
+  /**
+   * Column separator placed between the fields of every entry written by the tool.
+   * Optional on the command line; defaults to a comma.
+   */
+  @ParameterDescription(valueName = "char", description = "Column separator (must be a single character)")
+  @OptionalParameter(defaultValue=",")
+  String getSeparator();
+  
+  /**
+   * Name of the lemma-inflected form encoder; the tool stores it as the
+   * "fsa.dict.encoder" property of the generated metadata file.
+   * Optional on the command line; defaults to "prefix".
+   */
+  @ParameterDescription(valueName = "value", description = " Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].")
+  @OptionalParameter(defaultValue="prefix")
+  String getEncoder();
+  
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
index c87f016..0e7f2d5 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -23,8 +23,11 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Iterator;
+import java.util.Properties;
 
+import morfologik.stemming.DictionaryMetadata;
 import opennlp.tools.cmdline.BasicCmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
@@ -35,6 +38,8 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
   interface Params extends XMLDictionaryToTableParams {
   }
 
+  private String SEPARATOR;
+
   public String getShortDescription() {
     return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
   }
@@ -49,6 +54,7 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
     File dictInFile = params.getInputFile();
     File dictOutFile = params.getOutputFile();
     Charset encoding = params.getEncoding();
+    SEPARATOR = params.getSeparator();
 
     CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
     CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
@@ -66,17 +72,56 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
         encoding)) {
       while (iterator.hasNext()) {
         String word = iterator.next();
-        String wordAndLemma = word + "\t\t"; // lemma is empty
         for (String tag : tagDictionary.getTags(word)) {
-          writer.write(wordAndLemma + tag);
-          writer.newLine();
+          if(valid(word,tag)) {
+            String entry = createEntry(word, tag);
+            writer.write(entry);
+            writer.newLine();
+          }
         }
       }
       writer.close();
+      System.out.println("Created dictionary: " + dictOutFile.toPath());
     } catch (IOException e) {
       throw new TerminateToolException(-1, "Error while writing output: "
           + e.getMessage(), e);
     }
+    
+    Properties info = new Properties();
+    info.setProperty("fsa.dict.separator", SEPARATOR);
+    info.setProperty("fsa.dict.encoding", params.getEncoding().name());
+    info.setProperty("fsa.dict.encoder", params.getEncoder());
+    
+    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath());
+    
+    // Properties.store() flushes but does not close the stream, so own it here.
+    try (java.io.OutputStream metaOut = Files.newOutputStream(metaPath)) {
+      info.store(metaOut, "Info file for FSA Morfologik dictionary.");
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing metadata output: "
+          + e.getMessage(), e);
+    }
+    // Report the metadata file that was actually written, not the dictionary path.
+    System.out.println("Created metadata: " + metaPath);
+    
+  }
+
+  // Rejects (and reports on stdout) entries whose word or tag contains the separator.
+  private boolean valid(String word, String tag) {
+    boolean clean = !word.contains(SEPARATOR) && !tag.contains(SEPARATOR);
+    if (!clean) {
+      String warning = "Warn: invalid entry because contains separator - word: "
+          + word + " tag: " + tag;
+      System.out.println(warning);
+    }
+    return clean;
+  }
+
+  // Builds one dictionary line: empty base form, then the word, then the tag.
+  private String createEntry(String word, String tag) {
+    StringBuilder line = new StringBuilder();
+    line.append(SEPARATOR); // the base (lemma) column is left empty
+    line.append(word).append(SEPARATOR);
+    line.append(tag);
+    return line.toString();
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index dcb6554..93d6c61 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -17,8 +17,6 @@
 
 package opennlp.morfologik.tagdict;
 
-import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
-
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileNotFoundException;
@@ -27,7 +25,6 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.nio.file.Paths;
 import java.util.Map;
 
 import morfologik.stemming.DictionaryMetadata;
@@ -81,29 +78,6 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
   protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
     super.init(ngramDictionary, null);
     this.dict = posDictionary;
-
-    // get the dictionary path
-    String path = System.getProperty("morfologik.dict");
-    if (path == null) {
-      throw new IllegalArgumentException(
-          "The property fsa.dict is missing! -Dmorfologik.dict=path");
-    }
-
-    // now we try to load it...
-    try {
-      this.dictData = Files.readAllBytes(Paths.get(path));
-      this.dictInfo = Files.readAllBytes(getExpectedPropertiesFile(path)
-          .toPath());
-
-      this.dict = createMorfologikDictionary(dictData, dictInfo);
-
-    } catch (IllegalArgumentException e) {
-      throw new IllegalArgumentException(
-          "The file is not a Morfologik dictionary!", e);
-    } catch (IOException e) {
-      throw new IllegalArgumentException(
-          "Could not open the Morfologik dictionary or the .info file", e);
-    }
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
index 9233979..7341a02 100644
--- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -17,7 +17,7 @@
 
 package opennlp.morfologik.tagdict;
 
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
@@ -71,6 +71,8 @@ public class POSTaggerFactoryTest {
     POSTaggerFactory factory = posModel.getFactory();
     assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
 
+    factory = null;
+    
     ByteArrayOutputStream out = new ByteArrayOutputStream();
     posModel.serialize(out);
     ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
@@ -79,6 +81,8 @@ public class POSTaggerFactoryTest {
 
     factory = fromSerialized.getFactory();
     assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+    
+    assertEquals(2, factory.getTagDictionary().getTags("casa").length);
   }
 
 }
\ No newline at end of file


[10/16] opennlp git commit: OPENNLP-622 Added a different OpenNLP CLI loader that includes all jars in lib folder to classpath.

Posted by co...@apache.org.
OPENNLP-622 Added a different OpenNLP CLI loader that includes all jars in lib folder to classpath.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/60a3b24f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/60a3b24f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/60a3b24f

Branch: refs/heads/trunk
Commit: 60a3b24f186cc12ee9f053d3530055933bb2a3d9
Parents: be7e6ba
Author: William Colen <co...@apache.org>
Authored: Thu Jul 14 21:36:48 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Thu Jul 14 21:36:48 2016 +0000

----------------------------------------------------------------------
 src/main/bin/opennlp-cp | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/60a3b24f/src/main/bin/opennlp-cp
----------------------------------------------------------------------
diff --git a/src/main/bin/opennlp-cp b/src/main/bin/opennlp-cp
new file mode 100755
index 0000000..dff0d12
--- /dev/null
+++ b/src/main/bin/opennlp-cp
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+#   Licensed to the Apache Software Foundation (ASF) under one
+#   or more contributor license agreements.  See the NOTICE file
+#   distributed with this work for additional information
+#   regarding copyright ownership.  The ASF licenses this file
+#   to you under the Apache License, Version 2.0 (the
+#   "License"); you may not use this file except in compliance
+#   with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing,
+#   software distributed under the License is distributed on an
+#   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#   KIND, either express or implied.  See the License for the
+#   specific language governing permissions and limitations
+#   under the License.
+
+# Note:  Do not output anything in this script file, any output
+#        may be inadvertently placed in any output files if
+#        output redirection is used.
+
+if [ -z "$JAVACMD" ] ; then
+  if [ -n "$JAVA_HOME"  ] ; then
+    JAVACMD="$JAVA_HOME/bin/java"
+  else
+    JAVACMD="`which java`"
+  fi
+fi
+
+# Install root is the parent of this script's directory. Might fail if $0 is a link.
+OPENNLP_HOME=`dirname "$0"`/..
+# Classpath is anchored at OPENNLP_HOME (not the caller's cwd); "$@" keeps each argument intact.
+"$JAVACMD" -Xmx1024m -cp "$OPENNLP_HOME/lib/*" opennlp.tools.cmdline.CLI "$@"


[05/16] opennlp git commit: OPENNLP-622 Fixed PosTaggerFactory and restored test.

Posted by co...@apache.org.
OPENNLP-622 Fixed PosTaggerFactory and restored test.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ceb5540
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ceb5540
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ceb5540

Branch: refs/heads/trunk
Commit: 3ceb5540ced842875c010bb81169afcb544f203e
Parents: 1314887
Author: William Colen <co...@apache.org>
Authored: Fri Jul 8 03:52:14 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Fri Jul 8 03:52:14 2016 +0000

----------------------------------------------------------------------
 .../tagdict/MorfologikPOSTaggerFactory.java     |  46 +++--
 .../tagdict/POSTaggerFactoryTest.java           | 192 ++++++++-----------
 2 files changed, 106 insertions(+), 132 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ceb5540/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index 723b1ce..dcb6554 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -26,9 +26,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Map;
 
+import morfologik.stemming.DictionaryMetadata;
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.postag.POSTaggerFactory;
 import opennlp.tools.postag.TagDictionary;
@@ -53,23 +55,27 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
 
   public MorfologikPOSTaggerFactory() {
   }
-
-  /**
-   * Creates a new {@link POSTaggerFactory} that uses the a Morfologik based {@link TagDictionary}.
-   * 
-   * @param ngramDictionary a ngramDictionary 
-   * @param morfologikDictionary a Morfologik dictionary
-   * @param morfologikDictionaryMetadata the dictionary metadata
-   * @throws IOException invalid Morfologik dictionary
-   */
-  public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
-      byte[] morfologikDictionary, byte[] morfologikDictionaryMetadata) throws IOException {
-    super(ngramDictionary, null);
-    this.dictData = morfologikDictionary;
-    this.dictInfo = morfologikDictionaryMetadata;
+  
+  public TagDictionary createTagDictionary(File dictionary)
+      throws InvalidFormatException, FileNotFoundException, IOException {
+    // Loads the dictionary and its metadata, keeping the raw bytes in dictData/dictInfo.
+    if(!dictionary.canRead()) {
+      throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath());
+    }
+    
+    Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath());
+    // dictionaryMeta may be null here, so it must not be dereferenced when building the message.
+    if(dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) {
+      throw new FileNotFoundException("Could not read dictionary metadata: " + (dictionaryMeta == null ? "no metadata location for " + dictionary.getName() : dictionaryMeta.getFileName()));
+    }
+    
+    this.dictData = Files.readAllBytes(dictionary.toPath());
+    this.dictInfo = Files.readAllBytes(dictionaryMeta);
+    
+    return createMorfologikDictionary(dictData, dictInfo);
     
-    this.dict = createMorfologikDictionary(dictData, dictInfo);
   }
+  
 
   @Override
   protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
@@ -130,8 +136,7 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
 
   @Override
   public void setTagDictionary(TagDictionary dictionary) {
-    throw new UnsupportedOperationException(
-        "Morfologik POS Tagger factory does not support this operation");
+    this.dict = dictionary;
   }
 
   @Override
@@ -141,13 +146,6 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
   }
 
   @Override
-  public TagDictionary createTagDictionary(File dictionary)
-      throws InvalidFormatException, FileNotFoundException, IOException {
-    throw new UnsupportedOperationException(
-        "Morfologik POS Tagger factory does not support this operation");
-  }
-
-  @Override
   public TagDictionary createTagDictionary(InputStream in)
       throws InvalidFormatException, IOException {
     throw new UnsupportedOperationException(

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ceb5540/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
index 6c6814b..9233979 100644
--- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -1,108 +1,84 @@
-///*
-// * Licensed to the Apache Software Foundation (ASF) under one or more
-// * contributor license agreements.  See the NOTICE file distributed with
-// * this work for additional information regarding copyright ownership.
-// * The ASF licenses this file to You under the Apache License, Version 2.0
-// * (the "License"); you may not use this file except in compliance with
-// * the License. You may obtain a copy of the License at
-// *
-// *     http://www.apache.org/licenses/LICENSE-2.0
-// *
-// * Unless required by applicable law or agreed to in writing, software
-// * distributed under the License is distributed on an "AS IS" BASIS,
-// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// * See the License for the specific language governing permissions and
-// * limitations under the License.
-// */
-//
-//package opennlp.morfologik.tagdict;
-//
-//import static org.junit.Assert.assertTrue;
-//
-//import java.io.ByteArrayInputStream;
-//import java.io.ByteArrayOutputStream;
-//import java.io.File;
-//import java.io.IOException;
-//import java.io.InputStream;
-//import java.io.InputStreamReader;
-//import java.nio.charset.Charset;
-//import java.nio.file.Files;
-//import java.nio.file.Path;
-//import java.nio.file.Paths;
-//
-//import morfologik.stemming.DictionaryMetadata;
-//import morfologik.stemming.EncoderType;
-//import opennlp.morfologik.builder.MorfologikDictionayBuilder;
-//import opennlp.morfologik.builder.POSDictionayBuilderTest;
-//import opennlp.tools.dictionary.Dictionary;
-//import opennlp.tools.postag.DefaultPOSSequenceValidator;
-//import opennlp.tools.postag.POSContextGenerator;
-//import opennlp.tools.postag.POSDictionary;
-//import opennlp.tools.postag.POSModel;
-//import opennlp.tools.postag.POSSample;
-//import opennlp.tools.postag.POSTaggerFactory;
-//import opennlp.tools.postag.POSTaggerME;
-//import opennlp.tools.postag.WordTagSampleStream;
-//import opennlp.tools.util.BaseToolFactory;
-//import opennlp.tools.util.InvalidFormatException;
-//import opennlp.tools.util.ObjectStream;
-//import opennlp.tools.util.TrainingParameters;
-//import opennlp.tools.util.model.ModelType;
-//
-//import org.junit.Test;
-//
-///**
-// * Tests for the {@link POSTaggerFactory} class.
-// */
-//public class POSTaggerFactoryTest {
-//
-//  private static ObjectStream<POSSample> createSampleStream()
-//      throws IOException {
-//    InputStream in = POSTaggerFactoryTest.class.getClassLoader()
-//        .getResourceAsStream("AnnotatedSentences.txt");
-//
-//    return new WordTagSampleStream((new InputStreamReader(in)));
-//  }
-//
-//  static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
-//      throws IOException {
-//    return POSTaggerME.train("en", createSampleStream(),
-//        TrainingParameters.defaultParams(), factory);
-//  }
-//
-//  @Test
-//  public void testPOSTaggerWithCustomFactory() throws Exception {
-//
-//    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
-//    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
-//        "/dictionaryWithLemma.txt").getFile());
-//
-//    File dictOutFile = File.createTempFile(
-//        POSDictionayBuilderTest.class.getName(), ".dict");
-//
-//    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+",
-//        EncoderType.PREFIX);
-//
-//    Path dictPath = dictOutFile.toPath();
-//    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictPath);
-//
-//    byte[] dic = Files.readAllBytes(dictPath);
-//    byte[] meta = Files.readAllBytes(metaPath);
-//
-//    POSModel posModel = trainPOSModel(ModelType.MAXENT,
-//        new MorfologikPOSTaggerFactory(null, dic, meta));
-//
-//    POSTaggerFactory factory = posModel.getFactory();
-//    assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
-//
-//    ByteArrayOutputStream out = new ByteArrayOutputStream();
-//    posModel.serialize(out);
-//    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
-//
-//    POSModel fromSerialized = new POSModel(in);
-//
-//    factory = fromSerialized.getFactory();
-//    assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory);
-//  }
-//
-//}
\ No newline at end of file
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.postag.WordTagSampleStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelType;
+
+import org.junit.Test;
+
+/**
+ * Round-trip test: a model trained with a {@link MorfologikPOSTaggerFactory}
+ * must expose a {@link MorfologikTagDictionary} before and after serialization.
+ */
+public class POSTaggerFactoryTest {
+
+  /** Opens the classpath resource "AnnotatedSentences.txt" as a POS sample stream. */
+  private static ObjectStream<POSSample> createSampleStream()
+      throws IOException {
+    ClassLoader loader = POSTaggerFactoryTest.class.getClassLoader();
+    InputStream samples = loader.getResourceAsStream("AnnotatedSentences.txt");
+    return new WordTagSampleStream(new InputStreamReader(samples));
+  }
+
+  /** Trains an "en" model with default parameters; {@code type} is not consulted. */
+  static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory)
+      throws IOException {
+    ObjectStream<POSSample> samples = createSampleStream();
+    return POSTaggerME.train("en", samples, TrainingParameters.defaultParams(), factory);
+  }
+
+  @Test
+  public void testPOSTaggerWithCustomFactory() throws Exception {
+    // Build the factory around the dictionary produced by POSDictionayBuilderTest.
+    Path dictPath = POSDictionayBuilderTest.createMorfologikDictionary();
+    POSTaggerFactory trainingFactory = new MorfologikPOSTaggerFactory();
+    trainingFactory.setTagDictionary(
+        trainingFactory.createTagDictionary(dictPath.toFile()));
+
+    POSModel trained = trainPOSModel(ModelType.MAXENT, trainingFactory);
+    TagDictionary trainedDict = trained.getFactory().getTagDictionary();
+    assertTrue(trainedDict instanceof MorfologikTagDictionary);
+
+    // Serialize the model to memory and load it back.
+    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    trained.serialize(buffer);
+    POSModel reloaded = new POSModel(new ByteArrayInputStream(buffer.toByteArray()));
+
+    TagDictionary reloadedDict = reloaded.getFactory().getTagDictionary();
+    assertTrue(reloadedDict instanceof MorfologikTagDictionary);
+  }
+
+}
\ No newline at end of file