You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2013/12/02 14:23:05 UTC

svn commit: r1547014 - in /opennlp/addons/morfologik-addon: ./ src/main/java/opennlp/morfologik/builder/ src/main/java/opennlp/morfologik/cmdline/ src/main/java/opennlp/morfologik/cmdline/builder/ src/main/java/opennlp/morfologik/tagdict/ src/test/ src...

Author: colen
Date: Mon Dec  2 13:23:04 2013
New Revision: 1547014

URL: http://svn.apache.org/r1547014
Log:
OPENNLP-622 Added code to create Morfologik data from TSV or OpenNLP XML tag dictionaries. Created a TagDictionary implementation using Morfologik. Added a POSTaggerFactory to bundle the Morfologik dictionaries in POS Tagger models.

Added:
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java   (with props)
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java   (with props)
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java   (with props)
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java   (with props)
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java   (with props)
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java   (with props)
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java   (with props)
    opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java   (with props)
    opennlp/addons/morfologik-addon/src/test/
    opennlp/addons/morfologik-addon/src/test/java/
    opennlp/addons/morfologik-addon/src/test/java/opennlp/
    opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/
    opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/
    opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java   (with props)
    opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/
    opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java   (with props)
    opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/
    opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java   (with props)
    opennlp/addons/morfologik-addon/src/test/resources/
    opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt   (with props)
Modified:
    opennlp/addons/morfologik-addon/pom.xml

Modified: opennlp/addons/morfologik-addon/pom.xml
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/pom.xml?rev=1547014&r1=1547013&r2=1547014&view=diff
==============================================================================
--- opennlp/addons/morfologik-addon/pom.xml (original)
+++ opennlp/addons/morfologik-addon/pom.xml Mon Dec  2 13:23:04 2013
@@ -33,6 +33,12 @@
       <version>1.6.0</version>
       <scope>compile</scope>
     </dependency>
+   <dependency>
+      <groupId>org.carrot2</groupId>
+      <artifactId>morfologik-tools</artifactId>
+      <version>1.6.0</version>
+      <scope>compile</scope>
+    </dependency>
 
     <dependency>
       <groupId>org.apache.opennlp</groupId>
@@ -40,11 +46,12 @@
       <version>1.6.0-SNAPSHOT</version>
     </dependency>
 
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <version>3.8.1</version>
-      <scope>test</scope>
-    </dependency>
+	<dependency>
+		<groupId>junit</groupId>
+		<artifactId>junit</artifactId>
+		<version>4.8.1</version>
+		<scope>test</scope>
+	</dependency>
+
   </dependencies>
 </project>

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import morfologik.stemming.Dictionary;
+import morfologik.tools.FSABuildTool;
+import morfologik.tools.Launcher;
+
+/**
+ * Utility class to build Morfologik dictionaries from a tab separated values
+ * file. The first column is the word, the second its lemma and the third a POS
+ * tag. If there is no lemma information, leave the second column empty.
+ * <p>
+ * A build runs the Morfologik <code>tab2morph</code> tool, then
+ * {@link FSABuildTool}, and finally writes a properties file that records the
+ * parameters the dictionary was built with.
+ */
+public class MorfologikDictionayBuilder {
+
+  /**
+   * Builds a Morfologik binary dictionary. The properties are stored in the
+   * file named by {@link Dictionary#getExpectedFeaturesName(String)} for the
+   * absolute path of <code>dictOutFile</code>.
+   *
+   * @param dictInFile
+   *          the 3 column TSV dictionary file
+   * @param dictOutFile
+   *          where to store the binary Morfologik dictionary
+   * @param encoding
+   *          the encoding recorded in the dictionary properties
+   * @param separator
+   *          a field separator, usually '+'. If your tags contain '+', change
+   *          it to something else
+   * @param isUsePrefixes
+   *          whether to compact using prefixes
+   * @param isUseInfixes
+   *          whether to compact using infixes
+   * @throws Exception
+   *           if one of the build steps or writing the properties fails
+   */
+  public void build(File dictInFile, File dictOutFile, Charset encoding,
+      String separator, boolean isUsePrefixes, boolean isUseInfixes)
+      throws Exception {
+
+    File propertiesFile = new File(
+        Dictionary.getExpectedFeaturesName(dictOutFile.getAbsolutePath()));
+    this.build(dictInFile, dictOutFile, propertiesFile, encoding, separator,
+        isUsePrefixes, isUseInfixes);
+  }
+
+  /**
+   * Builds a Morfologik binary dictionary and writes its properties to an
+   * explicitly given file.
+   *
+   * @param dictInFile
+   *          the 3 column TSV dictionary file
+   * @param dictOutFile
+   *          where to store the binary Morfologik dictionary
+   * @param propertiesOutFile
+   *          where to store the properties of the Morfologik dictionary
+   * @param encoding
+   *          the encoding recorded in the dictionary properties
+   * @param separator
+   *          a field separator, usually '+'. If your tags contain '+', change
+   *          it to something else
+   * @param isUsePrefixes
+   *          whether to compact using prefixes
+   * @param isUseInfixes
+   *          whether to compact using infixes
+   * @throws Exception
+   *           if one of the build steps or writing the properties fails
+   */
+  public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
+      Charset encoding, String separator, boolean isUsePrefixes,
+      boolean isUseInfixes) throws Exception {
+
+    // we need to execute tab2morph followed by fsa_build
+    File morph = tab2morph(dictInFile, separator, isUsePrefixes, isUseInfixes);
+    try {
+      fsaBuild(morph, dictOutFile);
+    } finally {
+      // the intermediate file is useless afterwards, also when fsa_build
+      // failed; do not keep it around until the JVM exits
+      morph.delete();
+    }
+
+    // now we create the properties files using the passed parameters
+    createProperties(encoding, separator, isUsePrefixes, isUseInfixes,
+        propertiesOutFile);
+  }
+
+  /**
+   * Writes the <code>fsa.dict.*</code> properties (separator, encoding,
+   * uses-prefixes, uses-infixes) to the given file.
+   *
+   * @param encoding
+   *          the charset whose name is stored as <code>fsa.dict.encoding</code>
+   * @param separator
+   *          the value stored as <code>fsa.dict.separator</code>
+   * @param isUsePrefixes
+   *          the value stored as <code>fsa.dict.uses-prefixes</code>
+   * @param isUseInfixes
+   *          the value stored as <code>fsa.dict.uses-infixes</code>
+   * @param propertiesFile
+   *          the file to write, it is overwritten if it exists
+   * @throws FileNotFoundException
+   *           if the properties file can not be opened for writing
+   * @throws IOException
+   *           if writing the properties fails
+   */
+  void createProperties(Charset encoding, String separator,
+      boolean isUsePrefixes, boolean isUseInfixes, File propertiesFile)
+      throws FileNotFoundException, IOException {
+
+    Properties properties = new Properties();
+    properties.setProperty("fsa.dict.separator", separator);
+    properties.setProperty("fsa.dict.encoding", encoding.name());
+    properties.setProperty("fsa.dict.uses-prefixes",
+        Boolean.toString(isUsePrefixes));
+    properties.setProperty("fsa.dict.uses-infixes",
+        Boolean.toString(isUseInfixes));
+
+    OutputStream os = new FileOutputStream(propertiesFile);
+    try {
+      properties.store(os, "Morfologik POS Dictionary properties");
+    } finally {
+      // release the file handle even if store() throws
+      os.close();
+    }
+  }
+
+  /**
+   * Runs {@link FSABuildTool} with <code>-f cfsa2</code>, reading the
+   * tab2morph output and writing the binary dictionary.
+   */
+  private void fsaBuild(File morph, File dictOutFile) throws Exception {
+    String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
+        dictOutFile.getAbsolutePath() };
+    FSABuildTool.main(params);
+  }
+
+  /**
+   * Runs the <code>tab2morph</code> command of the Morfologik
+   * {@link Launcher} on the input dictionary.
+   *
+   * @return a temporary file holding the tab2morph output; it is registered
+   *         for deletion on JVM exit, callers may delete it earlier
+   */
+  private File tab2morph(File dictInFile, String separator,
+      boolean isUsePrefixes, boolean isUseInfixes) throws Exception {
+
+    // create tab2morph parameters
+    List<String> tab2morphParams = new ArrayList<String>();
+    tab2morphParams.add("tab2morph");
+
+    tab2morphParams.add("--annotation");
+    tab2morphParams.add(separator);
+
+    if (isUsePrefixes) {
+      tab2morphParams.add("-pre");
+    }
+
+    if (isUseInfixes) {
+      tab2morphParams.add("-inf");
+    }
+
+    tab2morphParams.add("-i");
+    tab2morphParams.add(dictInFile.getAbsolutePath());
+
+    // we need a temporary file to store the intermediate output
+    File tmp = File.createTempFile("tab2morph", ".txt");
+    tmp.deleteOnExit();
+
+    tab2morphParams.add("-o");
+    tab2morphParams.add(tmp.getAbsolutePath());
+
+    Launcher.main(tab2morphParams.toArray(new String[tab2morphParams.size()]));
+
+    return tmp;
+  }
+
+}

Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline;
+
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool;
+import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineTool;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.TypedCmdLineTool;
+import opennlp.tools.util.Version;
+
+/**
+ * Command line entry point of the OpenNLP Morfologik addon. The first
+ * argument selects one of the registered tools, the remaining arguments are
+ * handed over to that tool.
+ */
+public final class CLI {
+
+  public static final String CMD = "opennlp-morfologik-addon";
+
+  /** Registered tools keyed by {@link CmdLineTool#getName()}, in registration order. */
+  private static final Map<String, CmdLineTool> toolLookupMap;
+
+  static {
+    List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
+
+    tools.add(new MorfologikDictionaryBuilderTool());
+    tools.add(new XMLDictionaryToTableTool());
+
+    Map<String, CmdLineTool> lookup = new LinkedHashMap<String, CmdLineTool>();
+    for (CmdLineTool tool : tools) {
+      lookup.put(tool.getName(), tool);
+    }
+
+    toolLookupMap = Collections.unmodifiableMap(lookup);
+  }
+
+  /**
+   * @return a set which contains all tool names
+   */
+  public static Set<String> getToolNames() {
+    return toolLookupMap.keySet();
+  }
+
+  /**
+   * Prints the addon version, the list of tools with their short
+   * descriptions, and an example invocation to standard out.
+   */
+  private static void usage() {
+    System.out.print("OpenNLP Morfologik Addon "
+        + Version.currentVersion().toString() + ". ");
+    System.out.println("Usage: " + CMD + " TOOL");
+    System.out.println("where TOOL is one of:");
+
+    // column at which the descriptions start: longest tool name plus a gap
+    int longestName = 0;
+    for (String toolName : toolLookupMap.keySet()) {
+      longestName = Math.max(longestName, toolName.length());
+    }
+    int descriptionColumn = longestName + 4;
+
+    for (CmdLineTool tool : toolLookupMap.values()) {
+      String name = tool.getName();
+
+      StringBuilder line = new StringBuilder("  ").append(name);
+      // name.length() never exceeds longestName, so the padding is positive
+      for (int i = name.length(); i < descriptionColumn; i++) {
+        line.append(' ');
+      }
+      line.append(tool.getShortDescription());
+
+      System.out.println(line);
+    }
+
+    System.out
+        .println("All tools print help when invoked with help parameter");
+    // take the example from the registry so that it always names a tool
+    // which really exists
+    String exampleTool = toolLookupMap.keySet().iterator().next();
+    System.out.println("Example: " + CMD + " " + exampleTool + " help");
+  }
+
+  /**
+   * Runs the tool named by the first argument. A name of the form
+   * <code>TOOL.FORMAT</code> selects a format for typed tools. Without
+   * arguments the usage is printed. The JVM is terminated via
+   * {@link System#exit(int)} after printing usage or help, and with the code
+   * of the {@link TerminateToolException} if a tool fails.
+   *
+   * @param args
+   *          the tool name followed by the tool arguments
+   */
+  public static void main(String[] args) {
+
+    if (args.length == 0) {
+      usage();
+      System.exit(0);
+    }
+
+    String[] toolArguments = new String[args.length - 1];
+    System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
+
+    String toolName = args[0];
+
+    // check for format
+    String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
+    int idx = toolName.indexOf(".");
+    if (-1 < idx) {
+      formatName = toolName.substring(idx + 1);
+      toolName = toolName.substring(0, idx);
+    }
+    CmdLineTool tool = toolLookupMap.get(toolName);
+
+    try {
+      if (null == tool) {
+        throw new TerminateToolException(1, "Tool " + toolName
+            + " is not found.");
+      }
+
+      // help is printed on explicit request, or when a tool that needs
+      // parameters is started without any
+      boolean missingParams = 0 == toolArguments.length && tool.hasParams();
+      boolean helpRequested = 0 < toolArguments.length
+          && "help".equals(toolArguments[0]);
+
+      if (missingParams || helpRequested) {
+        if (tool instanceof TypedCmdLineTool) {
+          System.out.println(((TypedCmdLineTool) tool)
+              .getHelp(formatName));
+        } else if (tool instanceof BasicCmdLineTool) {
+          System.out.println(tool.getHelp());
+        }
+
+        System.exit(0);
+      }
+
+      if (tool instanceof TypedCmdLineTool) {
+        ((TypedCmdLineTool) tool).run(formatName, toolArguments);
+      } else if (tool instanceof BasicCmdLineTool) {
+        if (-1 == idx) {
+          ((BasicCmdLineTool) tool).run(toolArguments);
+        } else {
+          throw new TerminateToolException(1, "Tool " + toolName
+              + " does not support formats.");
+        }
+      } else {
+        throw new TerminateToolException(1, "Tool " + toolName
+            + " is not supported.");
+      }
+    } catch (TerminateToolException e) {
+
+      if (e.getMessage() != null) {
+        System.err.println(e.getMessage());
+      }
+
+      if (e.getCause() != null) {
+        System.err.println(e.getCause().getMessage());
+        e.getCause().printStackTrace(System.err);
+      }
+
+      System.exit(e.getCode());
+    }
+  }
+}

Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the Morfologik dictionary builder tool: input
+ * and output files, the FSA separator and the compaction switches. The
+ * encoding parameter is inherited from {@link EncodingParameter}.
+ */
+interface MorfologikDictionaryBuilderParams extends EncodingParameter {
+
+  /**
+   * @return the plain text input file with one dictionary entry per line
+   */
+  @ParameterDescription(valueName = "in", description = "Plain file with one entry per line")
+  File getInputFile();
+
+  /**
+   * @return the file the generated dictionary is written to
+   */
+  @ParameterDescription(valueName = "out", description = "The generated dictionary file.")
+  File getOutputFile();
+
+  /**
+   * @return the FSA dictionary separator, "+" if the parameter is omitted
+   */
+  @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
+  @OptionalParameter(defaultValue = "+")
+  String getFSADictSeparator();
+
+  /**
+   * @return whether to compact using prefixes, true if the parameter is
+   *         omitted
+   */
+  @ParameterDescription(valueName = "true|false", description = "Compact using prefixes.")
+  @OptionalParameter(defaultValue = "true")
+  Boolean getUsesPrefixes();
+
+  /**
+   * @return whether to compact using infixes, true if the parameter is
+   *         omitted
+   */
+  @ParameterDescription(valueName = "true|false", description = "Compact using infixes.")
+  @OptionalParameter(defaultValue = "true")
+  Boolean getUsesInfixes();
+
+}

Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+
+/**
+ * Command line tool which turns a plain text dictionary into a binary
+ * Morfologik dictionary plus the matching properties file.
+ */
+public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
+
+  interface Params extends MorfologikDictionaryBuilderParams {
+  }
+
+  public String getShortDescription() {
+    return "builds a binary POS Dictionary using Morfologik";
+  }
+
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /**
+   * Parses the arguments, checks the input and output files and runs the
+   * dictionary build.
+   *
+   * @param args
+   *          the tool arguments as described by {@link Params}
+   * @throws TerminateToolException
+   *           with code -1 if the build fails
+   */
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+
+    File input = params.getInputFile();
+    File output = params.getOutputFile();
+    // the properties file is placed where Morfologik expects it for the
+    // given dictionary file
+    File properties = new File(
+        Dictionary.getExpectedFeaturesName(output.getAbsolutePath()));
+    Charset charset = params.getEncoding();
+
+    CmdLineUtil.checkInputFile("dictionary input file", input);
+    CmdLineUtil.checkOutputFile("dictionary output file", output);
+    CmdLineUtil.checkOutputFile("properties output file", properties);
+
+    try {
+      new MorfologikDictionayBuilder().build(input, output, properties,
+          charset, params.getFSADictSeparator(), params.getUsesPrefixes(),
+          params.getUsesInfixes());
+    } catch (Exception e) {
+      String message = "Error while creating Morfologik POS Dictionay: "
+          + e.getMessage();
+      throw new TerminateToolException(-1, message, e);
+    }
+  }
+
+}

Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the tool that converts an OpenNLP XML tag
+ * dictionary to a tab separated file. The encoding parameter is inherited
+ * from {@link EncodingParameter}.
+ */
+interface XMLDictionaryToTableParams extends EncodingParameter {
+
+  /**
+   * @return the OpenNLP XML tag dictionary to read
+   */
+  @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
+  File getInputFile();
+
+  /**
+   * @return the file the tab separated output is written to
+   */
+  @ParameterDescription(valueName = "out", description = "Tab separated format.")
+  File getOutputFile();
+
+}

Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.util.Iterator;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.postag.POSDictionary;
+
+/**
+ * Command line tool which reads an OpenNLP XML tag dictionary and writes it
+ * as a tab separated file with the columns word, lemma and tag. The lemma
+ * column is always left empty.
+ */
+public class XMLDictionaryToTableTool extends BasicCmdLineTool {
+
+  interface Params extends XMLDictionaryToTableParams {
+  }
+
+  public String getShortDescription() {
+    return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
+  }
+
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /**
+   * Parses the arguments, loads the XML tag dictionary and writes one line
+   * per word and tag pair to the output file, using the requested encoding.
+   *
+   * @param args
+   *          the tool arguments as described by {@link Params}
+   * @throws TerminateToolException
+   *           with code -1 if the dictionary can not be read or the output
+   *           can not be written
+   */
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+
+    File dictInFile = params.getInputFile();
+    File dictOutFile = params.getOutputFile();
+    Charset encoding = params.getEncoding();
+
+    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+
+    POSDictionary tagDictionary;
+    // the stream is closed here, independently of what create() does with it
+    try (FileInputStream dictIn = new FileInputStream(dictInFile)) {
+      tagDictionary = POSDictionary.create(dictIn);
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "Error while loading XML POS Dictionay: " + e.getMessage(), e);
+    }
+    Iterator<String> iterator = tagDictionary.iterator();
+
+    // the writer is closed by the try-with-resources statement
+    try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(),
+        encoding)) {
+      while (iterator.hasNext()) {
+        String word = iterator.next();
+        String wordAndLemma = word + "\t\t"; // lemma is empty
+        for (String tag : tagDictionary.getTags(word)) {
+          writer.write(wordAndLemma + tag);
+          writer.newLine();
+        }
+      }
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing output: "
+          + e.getMessage(), e);
+    }
+  }
+
+}

Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Map;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * A {@link POSTaggerFactory} whose tag dictionary is a Morfologik binary
+ * dictionary.
+ * <p>
+ * During {@link #init(Dictionary, TagDictionary)} the dictionary file named by
+ * the {@code morfologik.dict} system property and its companion features file
+ * are read into memory; {@link #createArtifactMap()} publishes both byte
+ * arrays as artifacts. When no dictionary has been built yet,
+ * {@link #getTagDictionary()} rebuilds it from those artifacts.
+ */
+public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
+
+  /** System property holding the path of the Morfologik dictionary file. */
+  private static final String DICT_PATH_PROPERTY = "morfologik.dict";
+
+  private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
+  private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";
+
+  private static final String MORFOLOGIK_POSDICT = "tagdict."
+      + MORFOLOGIK_POSDICT_SUF;
+  private static final String MORFOLOGIK_DICT_INFO = "tagdict."
+      + MORFOLOGIK_DICT_INFO_SUF;
+
+  private TagDictionary dict;
+
+  private byte[] dictInfo;
+  private byte[] dictData;
+
+  /** Creates an uninitialized factory. */
+  public MorfologikPOSTaggerFactory() {
+  }
+
+  /**
+   * Creates a factory with the given ngram dictionary.
+   *
+   * @param ngramDictionary
+   *          the ngram dictionary handed to the superclass
+   * @param posDictionary
+   *          not forwarded: the superclass always receives {@code null} as its
+   *          tag dictionary
+   */
+  public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
+      TagDictionary posDictionary) {
+    super(ngramDictionary, null);
+  }
+
+  /**
+   * Initializes the factory and loads the Morfologik dictionary from disk.
+   *
+   * @param ngramDictionary
+   *          the ngram dictionary handed to the superclass
+   * @param posDictionary
+   *          stored first, then replaced by the Morfologik dictionary once it
+   *          has been loaded successfully
+   * @throws IllegalArgumentException
+   *           if the {@code morfologik.dict} system property is not set, or the
+   *           dictionary or its features file cannot be read or parsed
+   */
+  @Override
+  protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
+    super.init(ngramDictionary, null);
+    this.dict = posDictionary;
+
+    // get the dictionary path
+    String path = System.getProperty(DICT_PATH_PROPERTY);
+    if (path == null) {
+      // name the property that is actually read, so the hint is actionable
+      throw new IllegalArgumentException("The property " + DICT_PATH_PROPERTY
+          + " is missing! -D" + DICT_PATH_PROPERTY + "=path");
+    }
+
+    // now we try to load it...
+    try {
+      this.dictData = Files.readAllBytes(Paths.get(path));
+      this.dictInfo = Files.readAllBytes(Paths
+          .get(morfologik.stemming.Dictionary.getExpectedFeaturesName(path)));
+
+      this.dict = createMorfologikDictionary(dictData, dictInfo);
+
+    } catch (IllegalArgumentException e) {
+      throw new IllegalArgumentException(
+          "The file is not a Morfologik dictionary!", e);
+    } catch (IOException e) {
+      throw new IllegalArgumentException(
+          "Could not open the Morfologik dictionary or the .info file", e);
+    }
+  }
+
+  /**
+   * Returns the tag dictionary, building it from the artifact provider on
+   * first use when it has not been loaded yet.
+   *
+   * @return the dictionary, or {@code null} when none is loaded and the
+   *         artifact provider is absent or has no dictionary artifact
+   * @throws RuntimeException
+   *           if the stored artifacts cannot be turned into a dictionary
+   */
+  @Override
+  public TagDictionary getTagDictionary() {
+    if (this.dict == null && artifactProvider != null) {
+      // a single lookup is enough: reuse the object for the null check
+      Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
+      if (obj != null) {
+        byte[] data = (byte[]) obj;
+        byte[] info = (byte[]) artifactProvider
+            .getArtifact(MORFOLOGIK_DICT_INFO);
+
+        try {
+          this.dict = createMorfologikDictionary(data, info);
+        } catch (IllegalArgumentException e) {
+          throw new RuntimeException(
+              "Could not load the dictionary files to Morfologik.", e);
+        } catch (IOException e) {
+          throw new RuntimeException(
+              "IO error while reading the Morfologik dictionary files.", e);
+        }
+      }
+    }
+
+    return this.dict;
+  }
+
+  /**
+   * Not supported by this factory.
+   *
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  public void setTagDictionary(TagDictionary dictionary) {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  /**
+   * Not supported by this factory.
+   *
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  public TagDictionary createEmptyTagDictionary() {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  /**
+   * Not supported by this factory.
+   *
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  public TagDictionary createTagDictionary(File dictionary)
+      throws InvalidFormatException, FileNotFoundException, IOException {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  /**
+   * Not supported by this factory.
+   *
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  public TagDictionary createTagDictionary(InputStream in)
+      throws InvalidFormatException, IOException {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  /**
+   * Extends the superclass serializer map with byte-array serializers for the
+   * two Morfologik artifact suffixes.
+   */
+  @Override
+  @SuppressWarnings("rawtypes")
+  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
+    Map<String, ArtifactSerializer> serializers = super
+        .createArtifactSerializersMap();
+
+    serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
+    serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());
+
+    return serializers;
+  }
+
+  /**
+   * Extends the superclass artifact map with the raw dictionary bytes and the
+   * raw features-file bytes held by this factory.
+   */
+  @Override
+  public Map<String, Object> createArtifactMap() {
+    Map<String, Object> artifactMap = super.createArtifactMap();
+    artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
+    artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
+    return artifactMap;
+  }
+
+  /**
+   * Builds a {@link MorfologikTagDictionary} from in-memory dictionary bytes
+   * and features-file bytes.
+   */
+  private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
+      throws IOException {
+    morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary
+        .readAndClose(new ByteArrayInputStream(data), new ByteArrayInputStream(
+            info));
+    return new MorfologikTagDictionary(dict);
+  }
+
+  /** Serializer that stores an artifact as its raw bytes. */
+  static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
+
+    /** Reads the stream content through {@link ModelUtil#read(InputStream)}. */
+    @Override
+    public byte[] create(InputStream in) throws IOException,
+        InvalidFormatException {
+
+      return ModelUtil.read(in);
+    }
+
+    /** Writes the artifact bytes unchanged to the output stream. */
+    @Override
+    public void serialize(byte[] artifact, OutputStream out) throws IOException {
+      out.write(artifact);
+    }
+  }
+
+}

Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.postag.TagDictionary;
+
+/**
+ * A POS Tagger dictionary implementation based on Morfologik binary
+ * dictionaries.
+ */
+public class MorfologikTagDictionary implements TagDictionary {
+
+  private final IStemmer dictLookup;
+  private final boolean isCaseSensitive;
+
+  /**
+   * Creates a case sensitive {@link MorfologikTagDictionary}
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared for callers; not thrown by the code in this constructor
+   */
+  public MorfologikTagDictionary(Dictionary dict)
+      throws IllegalArgumentException, IOException {
+    this(dict, true);
+  }
+
+  /**
+   * Creates a {@link MorfologikTagDictionary}
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @param caseSensitive
+   *          if true it performs case sensitive lookup; if false the word is
+   *          lower-cased before it is looked up
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared for callers; not thrown by the code in this constructor
+   */
+  public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
+      throws IllegalArgumentException, IOException {
+    this.dictLookup = new DictionaryLookup(dict);
+    this.isCaseSensitive = caseSensitive;
+  }
+
+  /**
+   * Looks up the tags recorded for a word.
+   *
+   * @param word
+   *          the word to look up
+   * @return the tags of every matching entry that carries a tag, or
+   *         {@code null} when there is no such entry
+   */
+  @Override
+  public String[] getTags(String word) {
+    // Locale.ROOT keeps the lookup key independent of the JVM default locale
+    // (the default locale would, for instance, map 'I' to a dotless i under
+    // tr_TR and miss the dictionary entry).
+    String key = isCaseSensitive ? word
+        : word.toLowerCase(java.util.Locale.ROOT);
+
+    List<WordData> entries = dictLookup.lookup(key);
+    if (entries == null || entries.isEmpty()) {
+      return null;
+    }
+
+    List<String> tags = new ArrayList<String>(entries.size());
+    for (WordData entry : entries) {
+      // an entry without tag data yields null; skip it instead of failing
+      CharSequence tag = entry.getTag();
+      if (tag != null) {
+        tags.add(tag.toString());
+      }
+    }
+
+    return tags.isEmpty() ? null : tags.toArray(new String[tags.size()]);
+  }
+}

Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java (added)
+++ opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Properties;
+
+import junit.framework.TestCase;
+import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link MorfologikDictionayBuilder}: building a binary dictionary
+ * from the tab separated test resource and writing the dictionary properties
+ * file.
+ */
+public class POSDictionayBuilderTest extends TestCase {
+
+  /**
+   * Builds a dictionary from {@code /dictionaryWithLemma.txt} and checks that a
+   * lemmatizer can be created from the result.
+   */
+  @Test
+  public void testBuildDictionary() throws Exception {
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    // toURI() decodes percent-escapes (e.g. spaces in the checkout path),
+    // which URL.getFile() leaves encoded
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // do not leave the generated dictionary behind in the temp directory
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
+        .toURL());
+
+    assertNotNull(ml);
+  }
+
+  /**
+   * Checks that the written properties file carries the encoding, the
+   * separator and the prefix/infix flags that were passed in.
+   */
+  @Test
+  public void testPropertiesCreation() throws Exception {
+
+    Charset c = Charset.forName("iso-8859-1");
+    String sep = "_";
+
+    Properties p = createPropertiesHelper(c, sep, true, true);
+    assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
+    assertEquals(sep, p.getProperty("fsa.dict.separator"));
+    assertAffixFlags(p, true, true);
+
+    assertAffixFlags(createPropertiesHelper(c, sep, false, true), false, true);
+    assertAffixFlags(createPropertiesHelper(c, sep, true, false), true, false);
+  }
+
+  /** Asserts the prefix and infix flags stored in the given properties. */
+  private void assertAffixFlags(Properties p, boolean pref, boolean inf) {
+    assertEquals(pref,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+    assertEquals(inf,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+  }
+
+  /**
+   * Lets the builder write a properties file to a temp file and loads it back.
+   * The stream is closed and the temp file deleted even when writing or
+   * loading fails.
+   */
+  private Properties createPropertiesHelper(Charset c, String sep,
+      boolean pref, boolean inf) throws IOException {
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
+        ".info");
+    try {
+      builder.createProperties(c, sep, pref, inf, f);
+
+      Properties prop = new Properties();
+      try (InputStream is = new FileInputStream(f)) {
+        prop.load(is);
+      }
+      return prop;
+    } finally {
+      f.delete();
+    }
+  }
+
+}

Propchange: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java (added)
+++ opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,46 @@
+package opennlp.morfologik.lemmatizer;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+import org.junit.Test;
+
+/**
+ * Tests {@link MorfologikLemmatizer} against a dictionary built from the
+ * {@code /dictionaryWithLemma.txt} test resource.
+ */
+public class MorfologikLemmatizerTest {
+
+  /** Checks the lemma returned for a few word/tag pairs. */
+  @Test
+  public void testLemmatizeInsensitive() throws Exception {
+    DictionaryLemmatizer dict = createDictionary(false);
+
+    assertEquals("casar", dict.lemmatize("casa", "V"));
+    assertEquals("casa", dict.lemmatize("casa", "NOUN"));
+
+    assertEquals("casa", dict.lemmatize("Casa", "PROP"));
+
+  }
+
+  /**
+   * Builds a binary dictionary from the test resource into a temp file and
+   * creates a lemmatizer on top of it.
+   *
+   * @param caseSensitive
+   *          NOTE(review): currently not forwarded to the lemmatizer, which is
+   *          created from the dictionary URL only - confirm whether a case
+   *          sensitivity switch should be passed on
+   */
+  private MorfologikLemmatizer createDictionary(boolean caseSensitive)
+      throws Exception {
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    // toURI() decodes percent-escapes (e.g. spaces in the checkout path),
+    // which URL.getFile() leaves encoded
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // do not leave the generated dictionary behind in the temp directory
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    return new MorfologikLemmatizer(dictOutFile.toURI().toURL());
+  }
+
+}

Propchange: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java (added)
+++ opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java Mon Dec  2 13:23:04 2013
@@ -0,0 +1,92 @@
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.morfologik.tagdict.MorfologikTagDictionary;
+import opennlp.tools.postag.TagDictionary;
+
+import org.junit.Test;
+
+/**
+ * Tests {@link MorfologikTagDictionary} lookups, case sensitive and case
+ * insensitive, against a dictionary built from the
+ * {@code /dictionaryWithLemma.txt} test resource.
+ */
+public class MorfologikTagDictionaryTest {
+
+  /** An entry with an empty lemma column must still expose its tag. */
+  @Test
+  public void testNoLemma() throws Exception {
+    MorfologikTagDictionary dict = createDictionary(false);
+
+    List<String> tags = Arrays.asList(dict.getTags("carro"));
+    assertEquals(1, tags.size());
+    assertTrue(tags.contains("NOUN"));
+
+  }
+
+  @Test
+  public void testPOSDictionaryInsensitive() throws Exception {
+    TagDictionary dict = createDictionary(false);
+
+    List<String> tags = Arrays.asList(dict.getTags("casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+    // this is the behavior of case insensitive dictionary
+    // if we search it using case insensitive, Casa as a proper noun
+    // should be lower case in the dictionary
+    tags = Arrays.asList(dict.getTags("Casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+  }
+
+  @Test
+  public void testPOSDictionarySensitive() throws Exception {
+    TagDictionary dict = createDictionary(true);
+
+    List<String> tags = Arrays.asList(dict.getTags("casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+    // case sensitive lookup: "Casa" only matches the capitalized entry,
+    // which the test dictionary tags as a proper noun
+    tags = Arrays.asList(dict.getTags("Casa"));
+    assertEquals(1, tags.size());
+    assertTrue(tags.contains("PROP"));
+
+  }
+
+  /**
+   * Builds a binary dictionary from the test resource into a temp file and
+   * wraps it in a {@link MorfologikTagDictionary}.
+   *
+   * @param caseSensitive
+   *          passed on to the tag dictionary constructor
+   */
+  private MorfologikTagDictionary createDictionary(boolean caseSensitive)
+      throws Exception {
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    // toURI() decodes percent-escapes (e.g. spaces in the checkout path),
+    // which URL.getFile() leaves encoded
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // do not leave the generated dictionary behind in the temp directory
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    return new MorfologikTagDictionary(
+        Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
+  }
+
+}

Propchange: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt (added)
+++ opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt Mon Dec  2 13:23:04 2013
@@ -0,0 +1,10 @@
+casa	casa	NOUN
+casa	casar	V
+Casa	Casa	PROP
+casinha	casa	NOUN
+casona	casa	NOUN
+menina	menino	NOUN
+menino	menino	NOUN
+meninão	menino	NOUN
+menininho	menino	NOUN
+carro		NOUN

Propchange: opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain