You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2013/12/02 14:23:05 UTC
svn commit: r1547014 - in /opennlp/addons/morfologik-addon: ./
src/main/java/opennlp/morfologik/builder/
src/main/java/opennlp/morfologik/cmdline/
src/main/java/opennlp/morfologik/cmdline/builder/
src/main/java/opennlp/morfologik/tagdict/ src/test/ src...
Author: colen
Date: Mon Dec 2 13:23:04 2013
New Revision: 1547014
URL: http://svn.apache.org/r1547014
Log:
OPENNLP-622 Added code to create Morfologik data from TSV or OpenNLP XML tag dictionaries. Created a TagDictionary implementation using Morfologik. Added a POSTaggerFactory to bundle the Morfologik dictionaries in POS Tagger models.
Added:
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java (with props)
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java (with props)
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java (with props)
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java (with props)
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java (with props)
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java (with props)
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java (with props)
opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java (with props)
opennlp/addons/morfologik-addon/src/test/
opennlp/addons/morfologik-addon/src/test/java/
opennlp/addons/morfologik-addon/src/test/java/opennlp/
opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/
opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/
opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java (with props)
opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/
opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java (with props)
opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/
opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java (with props)
opennlp/addons/morfologik-addon/src/test/resources/
opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt (with props)
Modified:
opennlp/addons/morfologik-addon/pom.xml
Modified: opennlp/addons/morfologik-addon/pom.xml
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/pom.xml?rev=1547014&r1=1547013&r2=1547014&view=diff
==============================================================================
--- opennlp/addons/morfologik-addon/pom.xml (original)
+++ opennlp/addons/morfologik-addon/pom.xml Mon Dec 2 13:23:04 2013
@@ -33,6 +33,12 @@
<version>1.6.0</version>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>org.carrot2</groupId>
+ <artifactId>morfologik-tools</artifactId>
+ <version>1.6.0</version>
+ <scope>compile</scope>
+ </dependency>
<dependency>
<groupId>org.apache.opennlp</groupId>
@@ -40,11 +46,12 @@
<version>1.6.0-SNAPSHOT</version>
</dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>3.8.1</version>
- <scope>test</scope>
- </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
+
</dependencies>
</project>
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+import morfologik.stemming.Dictionary;
+import morfologik.tools.FSABuildTool;
+import morfologik.tools.Launcher;
+
+/**
+ * Utility class to build Morfologik dictionaries from a tab separated values
+ * file. The first column is the word, the second its lemma and the third a POS
+ * tag. If there is no lemma information leave the second column empty.
+ */
+public class MorfologikDictionayBuilder {
+
+  /**
+   * Builds a Morfologik binary dictionary. The properties file is written to
+   * the name that {@link Dictionary#getExpectedFeaturesName(String)} derives
+   * from the absolute path of {@code dictOutFile}.
+   *
+   * @param dictInFile
+   *          the 3 column TSV dictionary file
+   * @param dictOutFile
+   *          where to store the binary Morfologik dictionary
+   * @param encoding
+   *          the encoding to be used while reading and writing
+   * @param separator
+   *          a field separator, the default is '+'. If your tags contain '+'
+   *          change it to something else
+   * @param isUsePrefixes
+   *          whether to compact using prefixes
+   * @param isUseInfixes
+   *          whether to compact using infixes
+   * @throws Exception
+   *           if any step of the build fails
+   */
+  public void build(File dictInFile, File dictOutFile, Charset encoding,
+      String separator, boolean isUsePrefixes, boolean isUseInfixes)
+      throws Exception {
+
+    File propertiesFile = new File(
+        Dictionary.getExpectedFeaturesName(dictOutFile.getAbsolutePath()));
+    this.build(dictInFile, dictOutFile, propertiesFile, encoding, separator,
+        isUsePrefixes, isUseInfixes);
+  }
+
+  /**
+   * Builds a Morfologik binary dictionary and writes its properties to an
+   * explicitly given file.
+   *
+   * @param dictInFile
+   *          the 3 column TSV dictionary file
+   * @param dictOutFile
+   *          where to store the binary Morfologik dictionary
+   * @param propertiesOutFile
+   *          where to store the properties of the Morfologik dictionary
+   * @param encoding
+   *          the encoding to be used while reading and writing
+   * @param separator
+   *          a field separator, the default is '+'. If your tags contain '+'
+   *          change it to something else
+   * @param isUsePrefixes
+   *          whether to compact using prefixes
+   * @param isUseInfixes
+   *          whether to compact using infixes
+   * @throws Exception
+   *           if any step of the build fails
+   */
+  public void build(File dictInFile, File dictOutFile, File propertiesOutFile,
+      Charset encoding, String separator, boolean isUsePrefixes,
+      boolean isUseInfixes) throws Exception {
+
+    // we need to execute tab2morph followed by fsa_build
+    File morph = tab2morph(dictInFile, separator, isUsePrefixes, isUseInfixes);
+    try {
+      fsaBuild(morph, dictOutFile);
+    } finally {
+      // remove the intermediate file even when fsa_build fails
+      morph.delete();
+    }
+
+    // now we create the properties files using the passed parameters
+    createProperties(encoding, separator, isUsePrefixes, isUseInfixes,
+        propertiesOutFile);
+  }
+
+  /**
+   * Writes the dictionary properties (separator, encoding name and the
+   * prefix/infix flags) to {@code propertiesFile}.
+   *
+   * @throws FileNotFoundException
+   *           if the properties file cannot be opened for writing
+   * @throws IOException
+   *           if writing the properties fails
+   */
+  void createProperties(Charset encoding, String separator,
+      boolean isUsePrefixes, boolean isUseInfixes, File propertiesFile)
+      throws FileNotFoundException, IOException {
+
+    Properties properties = new Properties();
+    properties.setProperty("fsa.dict.separator", separator);
+    properties.setProperty("fsa.dict.encoding", encoding.name());
+    properties.setProperty("fsa.dict.uses-prefixes",
+        Boolean.toString(isUsePrefixes));
+    properties.setProperty("fsa.dict.uses-infixes",
+        Boolean.toString(isUseInfixes));
+
+    // try-with-resources closes the stream even when store() throws
+    try (OutputStream os = new FileOutputStream(propertiesFile)) {
+      properties.store(os, "Morfologik POS Dictionary properties");
+    }
+  }
+
+  /**
+   * Runs {@link FSABuildTool} on the intermediate file, requesting the
+   * "cfsa2" format and writing the result to {@code dictOutFile}.
+   */
+  private void fsaBuild(File morph, File dictOutFile) throws Exception {
+    String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o",
+        dictOutFile.getAbsolutePath() };
+    FSABuildTool.main(params);
+  }
+
+  /**
+   * Runs the "tab2morph" command of the Morfologik {@link Launcher} on the TSV
+   * input and returns the temporary file that receives its output.
+   *
+   * @return the temporary intermediate file; it is also registered for
+   *         deletion when the JVM exits
+   */
+  private File tab2morph(File dictInFile, String separator,
+      boolean isUsePrefixes, boolean isUseInfixes) throws Exception {
+
+    // create tab2morph parameters
+    List<String> tab2morphParams = new ArrayList<String>();
+    tab2morphParams.add("tab2morph");
+
+    tab2morphParams.add("--annotation");
+    tab2morphParams.add(separator);
+
+    if (isUsePrefixes) {
+      tab2morphParams.add("-pre");
+    }
+
+    if (isUseInfixes) {
+      tab2morphParams.add("-inf");
+    }
+
+    tab2morphParams.add("-i");
+    tab2morphParams.add(dictInFile.getAbsolutePath());
+
+    // we need a temporary file to store the intermediate output
+    File tmp = File.createTempFile("tab2morph", ".txt");
+    tmp.deleteOnExit();
+
+    tab2morphParams.add("-o");
+    tab2morphParams.add(tmp.getAbsolutePath());
+
+    Launcher.main(tab2morphParams.toArray(new String[tab2morphParams.size()]));
+
+    return tmp;
+  }
+
+}
Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline;
+
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.morfologik.cmdline.builder.MorfologikDictionaryBuilderTool;
+import opennlp.morfologik.cmdline.builder.XMLDictionaryToTableTool;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineTool;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.TypedCmdLineTool;
+import opennlp.tools.util.Version;
+
+/**
+ * Command line entry point of the Morfologik addon. It dispatches the first
+ * argument (optionally suffixed with ".format") to one of the registered
+ * tools and passes the remaining arguments on to that tool.
+ */
+public final class CLI {
+
+  public static final String CMD = "opennlp-morfologik-addon";
+
+  // tool name -> tool, in registration order; unmodifiable after class init
+  private static Map<String, CmdLineTool> toolLookupMap;
+
+  static {
+    toolLookupMap = new LinkedHashMap<String, CmdLineTool>();
+
+    List<CmdLineTool> tools = new LinkedList<CmdLineTool>();
+
+    tools.add(new MorfologikDictionaryBuilderTool());
+    tools.add(new XMLDictionaryToTableTool());
+
+    for (CmdLineTool tool : tools) {
+      toolLookupMap.put(tool.getName(), tool);
+    }
+
+    toolLookupMap = Collections.unmodifiableMap(toolLookupMap);
+  }
+
+  /**
+   * @return a set which contains all tool names
+   */
+  public static Set<String> getToolNames() {
+    return toolLookupMap.keySet();
+  }
+
+  /**
+   * Prints the addon version, the list of registered tools with their short
+   * descriptions, and an example invocation to standard output.
+   */
+  private static void usage() {
+    System.out.print("OpenNLP Morfologik Addon "
+        + Version.currentVersion().toString() + ". ");
+    System.out.println("Usage: " + CMD + " TOOL");
+    System.out.println("where TOOL is one of:");
+
+    // distance of tool name from line start
+    int numberOfSpaces = -1;
+    for (String toolName : toolLookupMap.keySet()) {
+      if (toolName.length() > numberOfSpaces) {
+        numberOfSpaces = toolName.length();
+      }
+    }
+    numberOfSpaces = numberOfSpaces + 4;
+
+    for (CmdLineTool tool : toolLookupMap.values()) {
+
+      System.out.print(" " + tool.getName());
+
+      // pad so that all descriptions start in the same column
+      for (int i = 0; i < Math.abs(tool.getName().length()
+          - numberOfSpaces); i++) {
+        System.out.print(" ");
+      }
+
+      System.out.println(tool.getShortDescription());
+    }
+
+    System.out
+        .println("All tools print help when invoked with help parameter");
+
+    // Build the example from a tool that is actually registered; the
+    // previously hard-coded "POSDictionaryBuilder" is not one of the tools
+    // added in the static initializer (assuming the tools do not override
+    // getName() - TODO confirm).
+    String exampleTool = toolLookupMap.keySet().iterator().next();
+    System.out.println("Example: " + CMD + " " + exampleTool + " help");
+  }
+
+  /**
+   * Runs the tool named by {@code args[0]}. Without arguments the usage text
+   * is printed. A tool that is unknown, or that fails with a
+   * {@link TerminateToolException}, terminates the JVM with the exception's
+   * exit code after its message and cause have been printed to standard error.
+   *
+   * @param args
+   *          the tool name (optionally "name.format") followed by the tool's
+   *          own arguments
+   */
+  public static void main(String[] args) {
+
+    if (args.length == 0) {
+      usage();
+      System.exit(0);
+    }
+
+    // everything after the tool name belongs to the tool
+    String[] toolArguments = new String[args.length - 1];
+    System.arraycopy(args, 1, toolArguments, 0, toolArguments.length);
+
+    String toolName = args[0];
+
+    // check for format
+    String formatName = StreamFactoryRegistry.DEFAULT_FORMAT;
+    int idx = toolName.indexOf(".");
+    if (-1 < idx) {
+      formatName = toolName.substring(idx + 1);
+      toolName = toolName.substring(0, idx);
+    }
+    CmdLineTool tool = toolLookupMap.get(toolName);
+
+    try {
+      if (null == tool) {
+        throw new TerminateToolException(1, "Tool " + toolName
+            + " is not found.");
+      }
+
+      // help is shown when required params are missing or on explicit request
+      if ((0 == toolArguments.length && tool.hasParams())
+          || 0 < toolArguments.length
+          && "help".equals(toolArguments[0])) {
+        if (tool instanceof TypedCmdLineTool) {
+          System.out.println(((TypedCmdLineTool) tool)
+              .getHelp(formatName));
+        } else if (tool instanceof BasicCmdLineTool) {
+          System.out.println(tool.getHelp());
+        }
+
+        System.exit(0);
+      }
+
+      if (tool instanceof TypedCmdLineTool) {
+        ((TypedCmdLineTool) tool).run(formatName, toolArguments);
+      } else if (tool instanceof BasicCmdLineTool) {
+        // basic tools have no notion of formats, so "name.format" is an error
+        if (-1 == idx) {
+          ((BasicCmdLineTool) tool).run(toolArguments);
+        } else {
+          throw new TerminateToolException(1, "Tool " + toolName
+              + " does not support formats.");
+        }
+      } else {
+        throw new TerminateToolException(1, "Tool " + toolName
+            + " is not supported.");
+      }
+    } catch (TerminateToolException e) {
+
+      if (e.getMessage() != null) {
+        System.err.println(e.getMessage());
+      }
+
+      if (e.getCause() != null) {
+        System.err.println(e.getCause().getMessage());
+        e.getCause().printStackTrace(System.err);
+      }
+
+      System.exit(e.getCode());
+    }
+  }
+}
Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/CLI.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the Morfologik dictionary builder tool: the
+ * input and output files, the encoding (inherited from
+ * {@link EncodingParameter}) and the optional FSA settings.
+ */
+interface MorfologikDictionaryBuilderParams extends EncodingParameter {
+
+ /**
+ * @return the plain text input file with one entry per line
+ */
+ @ParameterDescription(valueName = "in", description = "Plain file with one entry per line")
+ File getInputFile();
+
+ /**
+ * @return the file the generated dictionary is written to
+ */
+ @ParameterDescription(valueName = "out", description = "The generated dictionary file.")
+ File getOutputFile();
+
+ /**
+ * @return the FSA dictionary separator; optional, defaults to "+"
+ */
+ @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. Default is '+'.")
+ @OptionalParameter(defaultValue = "+")
+ String getFSADictSeparator();
+
+ /**
+ * @return whether to compact using prefixes; optional, defaults to "true"
+ */
+ @ParameterDescription(valueName = "true|false", description = "Compact using prefixes.")
+ @OptionalParameter(defaultValue = "true")
+ Boolean getUsesPrefixes();
+
+ /**
+ * @return whether to compact using infixes; optional, defaults to "true"
+ */
+ @ParameterDescription(valueName = "true|false", description = "Compact using infixes.")
+ @OptionalParameter(defaultValue = "true")
+ Boolean getUsesInfixes();
+
+}
Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+
+public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool {
+
+ interface Params extends MorfologikDictionaryBuilderParams {
+ }
+
+ public String getShortDescription() {
+ return "builds a binary POS Dictionary using Morfologik";
+ }
+
+ public String getHelp() {
+ return getBasicHelp(Params.class);
+ }
+
+ public void run(String[] args) {
+ Params params = validateAndParseParams(args, Params.class);
+
+ File dictInFile = params.getInputFile();
+ File dictOutFile = params.getOutputFile();
+ File propertiesFile = getExpectedPropertiesFile(dictOutFile);
+ Charset encoding = params.getEncoding();
+
+ CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+ CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+ CmdLineUtil.checkOutputFile("properties output file", propertiesFile);
+
+ MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+ try {
+ builder.build(dictInFile, dictOutFile, propertiesFile, encoding,
+ params.getFSADictSeparator(), params.getUsesPrefixes(),
+ params.getUsesInfixes());
+ } catch (Exception e) {
+ throw new TerminateToolException(-1,
+ "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e);
+ }
+
+ }
+
+ private File getExpectedPropertiesFile(File dictFile) {
+ return new File(Dictionary.getExpectedFeaturesName(dictFile
+ .getAbsolutePath()));
+ }
+
+}
Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.params.EncodingParameter;
+
+/**
+ * Command line parameters of the XML dictionary to table tool: the input and
+ * output files plus the encoding inherited from {@link EncodingParameter}.
+ */
+interface XMLDictionaryToTableParams extends EncodingParameter {
+
+ /**
+ * @return the OpenNLP XML tag dictionary to read
+ */
+ @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
+ File getInputFile();
+
+ /**
+ * @return the file that receives the tab separated output
+ */
+ @ParameterDescription(valueName = "out", description = "Tab separated format.")
+ File getOutputFile();
+
+}
Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.cmdline.builder;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.util.Iterator;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.postag.POSDictionary;
+
+/**
+ * Command line tool that converts an OpenNLP XML tag dictionary into a tab
+ * separated file with the columns word, lemma (always empty) and tag.
+ */
+public class XMLDictionaryToTableTool extends BasicCmdLineTool {
+
+  interface Params extends XMLDictionaryToTableParams {
+  }
+
+  public String getShortDescription() {
+    return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
+  }
+
+  public String getHelp() {
+    return getBasicHelp(Params.class);
+  }
+
+  /**
+   * Loads the XML tag dictionary and writes one "word, empty lemma, tag" line
+   * per word/tag pair to the output file, using the requested encoding.
+   * Failures while loading or writing are rethrown as a
+   * {@link TerminateToolException} with exit code -1.
+   *
+   * @param args
+   *          the tool arguments described by {@link Params}
+   */
+  public void run(String[] args) {
+    Params params = validateAndParseParams(args, Params.class);
+
+    File dictInFile = params.getInputFile();
+    File dictOutFile = params.getOutputFile();
+    Charset encoding = params.getEncoding();
+
+    CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
+    CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
+
+    POSDictionary tagDictionary;
+    // try-with-resources makes sure the input stream is closed after loading
+    try (FileInputStream dictIn = new FileInputStream(dictInFile)) {
+      tagDictionary = POSDictionary.create(dictIn);
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "Error while loading XML POS Dictionay: " + e.getMessage(), e);
+    }
+    Iterator<String> iterator = tagDictionary.iterator();
+
+    // the writer is closed by try-with-resources, no explicit close() needed
+    try (BufferedWriter writer = Files.newBufferedWriter(dictOutFile.toPath(),
+        encoding)) {
+      while (iterator.hasNext()) {
+        String word = iterator.next();
+        String wordAndLemma = word + "\t\t"; // lemma is empty
+        for (String tag : tagDictionary.getTags(word)) {
+          writer.write(wordAndLemma + tag);
+          writer.newLine();
+        }
+      }
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing output: "
+          + e.getMessage(), e);
+    }
+  }
+
+}
Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Map;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.postag.POSTaggerFactory;
+import opennlp.tools.postag.TagDictionary;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * A {@link POSTaggerFactory} whose tag dictionary is a Morfologik binary
+ * dictionary. The raw bytes of the dictionary and of its properties file are
+ * kept as model artifacts so that they can be stored with, and restored from,
+ * a POS Tagger model.
+ */
+public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
+
+  private static final String MORFOLOGIK_POSDICT_SUF = "morfologik_dict";
+  private static final String MORFOLOGIK_DICT_INFO_SUF = "morfologik_info";
+
+  private static final String MORFOLOGIK_POSDICT = "tagdict."
+      + MORFOLOGIK_POSDICT_SUF;
+  private static final String MORFOLOGIK_DICT_INFO = "tagdict."
+      + MORFOLOGIK_DICT_INFO_SUF;
+
+  private TagDictionary dict;
+
+  // raw file contents, kept so they can be exported as model artifacts
+  private byte[] dictInfo;
+  private byte[] dictData;
+
+  public MorfologikPOSTaggerFactory() {
+  }
+
+  /**
+   * Creates a factory for the given n-gram dictionary.
+   *
+   * @param ngramDictionary
+   *          the n-gram dictionary handed to the super class
+   * @param posDictionary
+   *          not forwarded: {@code null} is passed to the super constructor
+   *          instead
+   */
+  public MorfologikPOSTaggerFactory(Dictionary ngramDictionary,
+      TagDictionary posDictionary) {
+    super(ngramDictionary, null);
+  }
+
+  /**
+   * Initializes the factory and loads the Morfologik dictionary named by the
+   * system property {@code morfologik.dict}. The properties file is expected
+   * at the path that
+   * {@link morfologik.stemming.Dictionary#getExpectedFeaturesName(String)}
+   * derives from the dictionary path.
+   *
+   * @throws IllegalArgumentException
+   *           if the system property is not set, or the files cannot be read
+   *           or are not a valid Morfologik dictionary
+   */
+  @Override
+  protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
+    super.init(ngramDictionary, null);
+    this.dict = posDictionary;
+
+    // get the dictionary path
+    String path = System.getProperty("morfologik.dict");
+    if (path == null) {
+      // name the property that is actually read above
+      throw new IllegalArgumentException(
+          "The property morfologik.dict is missing! -Dmorfologik.dict=path");
+    }
+
+    // now we try to load it...
+    try {
+      this.dictData = Files.readAllBytes(Paths.get(path));
+      this.dictInfo = Files.readAllBytes(Paths
+          .get(morfologik.stemming.Dictionary.getExpectedFeaturesName(path)));
+
+      this.dict = createMorfologikDictionary(dictData, dictInfo);
+
+    } catch (IllegalArgumentException e) {
+      throw new IllegalArgumentException(
+          "The file is not a Morfologik dictionary!", e);
+    } catch (IOException e) {
+      throw new IllegalArgumentException(
+          "Could not open the Morfologik dictionary or the .info file", e);
+    }
+  }
+
+  /**
+   * Returns the tag dictionary, creating it on first use from the artifacts
+   * of the artifact provider when it was not already loaded.
+   *
+   * @return the tag dictionary, or {@code null} when none is loaded and no
+   *         dictionary artifact is available
+   * @throws RuntimeException
+   *           if the artifacts cannot be turned into a Morfologik dictionary
+   */
+  @Override
+  public TagDictionary getTagDictionary() {
+    if (this.dict != null || artifactProvider == null) {
+      return this.dict;
+    }
+
+    Object obj = artifactProvider.getArtifact(MORFOLOGIK_POSDICT);
+    if (obj != null) {
+      // reuse the artifact that was just looked up instead of fetching twice
+      byte[] data = (byte[]) obj;
+      byte[] info = (byte[]) artifactProvider
+          .getArtifact(MORFOLOGIK_DICT_INFO);
+
+      try {
+        this.dict = createMorfologikDictionary(data, info);
+      } catch (IllegalArgumentException e) {
+        throw new RuntimeException(
+            "Could not load the dictionary files to Morfologik.", e);
+      } catch (IOException e) {
+        throw new RuntimeException(
+            "IO error while reading the Morfologik dictionary files.", e);
+      }
+    }
+
+    return this.dict;
+  }
+
+  /**
+   * Not supported by this factory.
+   *
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  public void setTagDictionary(TagDictionary dictionary) {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  /**
+   * Not supported by this factory.
+   *
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  public TagDictionary createEmptyTagDictionary() {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  /**
+   * Not supported by this factory.
+   *
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  public TagDictionary createTagDictionary(File dictionary)
+      throws InvalidFormatException, FileNotFoundException, IOException {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  /**
+   * Not supported by this factory.
+   *
+   * @throws UnsupportedOperationException
+   *           always
+   */
+  @Override
+  public TagDictionary createTagDictionary(InputStream in)
+      throws InvalidFormatException, IOException {
+    throw new UnsupportedOperationException(
+        "Morfologik POS Tagger factory does not support this operation");
+  }
+
+  /**
+   * Adds byte array serializers for the two Morfologik artifact suffixes to
+   * the serializers of the super class.
+   */
+  @Override
+  @SuppressWarnings("rawtypes")
+  public Map<String, ArtifactSerializer> createArtifactSerializersMap() {
+    Map<String, ArtifactSerializer> serializers = super
+        .createArtifactSerializersMap();
+
+    serializers.put(MORFOLOGIK_POSDICT_SUF, new ByteArraySerializer());
+    serializers.put(MORFOLOGIK_DICT_INFO_SUF, new ByteArraySerializer());
+
+    return serializers;
+  }
+
+  /**
+   * Adds the raw dictionary and properties bytes to the artifacts of the
+   * super class.
+   */
+  @Override
+  public Map<String, Object> createArtifactMap() {
+    Map<String, Object> artifactMap = super.createArtifactMap();
+    artifactMap.put(MORFOLOGIK_POSDICT, this.dictData);
+    artifactMap.put(MORFOLOGIK_DICT_INFO, this.dictInfo);
+    return artifactMap;
+  }
+
+  /**
+   * Builds a {@link MorfologikTagDictionary} from the in-memory contents of a
+   * dictionary file and of its properties file.
+   */
+  private TagDictionary createMorfologikDictionary(byte[] data, byte[] info)
+      throws IOException {
+    morfologik.stemming.Dictionary dict = morfologik.stemming.Dictionary
+        .readAndClose(new ByteArrayInputStream(data), new ByteArrayInputStream(
+            info));
+    return new MorfologikTagDictionary(dict);
+  }
+
+  /**
+   * Serializer that stores an artifact as plain bytes.
+   */
+  static class ByteArraySerializer implements ArtifactSerializer<byte[]> {
+
+    public byte[] create(InputStream in) throws IOException,
+        InvalidFormatException {
+
+      return ModelUtil.read(in);
+    }
+
+    public void serialize(byte[] artifact, OutputStream out) throws IOException {
+      out.write(artifact);
+    }
+  }
+
+}
Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java (added)
+++ opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.tagdict;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+import opennlp.tools.postag.TagDictionary;
+
+/**
+ * A POS tagger {@link TagDictionary} backed by a Morfologik binary
+ * dictionary.
+ * <p>
+ * A lookup returns the tag of every entry Morfologik reports for the word.
+ * Entries that carry no tag data are skipped.
+ */
+public class MorfologikTagDictionary implements TagDictionary {
+
+  private final IStemmer dictLookup;
+  private final boolean isCaseSensitive;
+
+  /**
+   * Creates a case sensitive {@link MorfologikTagDictionary}.
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared by the signature and kept for compatibility with
+   *           existing callers
+   */
+  public MorfologikTagDictionary(Dictionary dict)
+      throws IllegalArgumentException, IOException {
+    this(dict, true);
+  }
+
+  /**
+   * Creates a {@link MorfologikTagDictionary}.
+   *
+   * @param dict
+   *          a Morfologik FSA dictionary
+   * @param caseSensitive
+   *          if true the word is looked up as given; if false it is
+   *          lower-cased before the lookup
+   * @throws IllegalArgumentException
+   *           if FSA's root node cannot be acquired (dictionary is empty).
+   * @throws IOException
+   *           declared by the signature and kept for compatibility with
+   *           existing callers
+   */
+  public MorfologikTagDictionary(Dictionary dict, boolean caseSensitive)
+      throws IllegalArgumentException, IOException {
+    this.dictLookup = new DictionaryLookup(dict);
+    this.isCaseSensitive = caseSensitive;
+  }
+
+  /**
+   * Returns the tags the dictionary holds for a word.
+   *
+   * @param word
+   *          the word to look up
+   * @return the tags in the order the lookup reports them, or {@code null}
+   *         when the word is unknown or none of its entries has a tag
+   */
+  @Override
+  public String[] getTags(String word) {
+    // NOTE(review): toLowerCase() uses the JVM default locale.
+    String key = isCaseSensitive ? word : word.toLowerCase();
+
+    List<WordData> entries = dictLookup.lookup(key);
+    if (entries == null || entries.isEmpty()) {
+      return null;
+    }
+
+    List<String> tags = new ArrayList<String>(entries.size());
+    for (WordData entry : entries) {
+      // getTag() is null for an entry without tag data; calling toString()
+      // on it unguarded would throw a NullPointerException.
+      CharSequence tag = entry.getTag();
+      if (tag != null) {
+        tags.add(tag.toString());
+      }
+    }
+
+    if (tags.isEmpty()) {
+      return null;
+    }
+    return tags.toArray(new String[tags.size()]);
+  }
+}
Propchange: opennlp/addons/morfologik-addon/src/main/java/opennlp/morfologik/tagdict/MorfologikTagDictionary.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java (added)
+++ opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.builder;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Properties;
+
+import junit.framework.TestCase;
+import opennlp.morfologik.lemmatizer.MorfologikLemmatizer;
+
+import org.junit.Test;
+
+/**
+ * Tests for {@link MorfologikDictionayBuilder}: building a binary dictionary
+ * from the sample table and writing the dictionary properties file.
+ */
+public class POSDictionayBuilderTest extends TestCase {
+
+  /**
+   * Builds a dictionary from {@code /dictionaryWithLemma.txt} and checks that
+   * a {@link MorfologikLemmatizer} can be created from the output file.
+   */
+  @Test
+  public void testBuildDictionary() throws Exception {
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    // new File(URI) decodes percent-escapes (such as %20 for a space) that
+    // URL.getFile() would leave in the path.
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // Do not leave the generated dictionary behind in the temp directory.
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI()
+        .toURL());
+
+    assertNotNull(ml);
+  }
+
+  /**
+   * Writes the properties file with several prefix/infix combinations and
+   * checks that the values read back match what was requested.
+   */
+  @Test
+  public void testPropertiesCreation() throws Exception {
+
+    Charset c = Charset.forName("iso-8859-1");
+    String sep = "_";
+    boolean pref = true;
+    boolean inf = true;
+    Properties p = createPropertiesHelper(c, sep, pref, inf);
+
+    assertEquals(c.name(), p.getProperty("fsa.dict.encoding"));
+    assertEquals(sep, p.getProperty("fsa.dict.separator"));
+    assertEquals(pref,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+    assertEquals(inf,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+
+    pref = false;
+    inf = true;
+    p = createPropertiesHelper(c, sep, pref, inf);
+    assertEquals(pref,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+    assertEquals(inf,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+
+    pref = true;
+    inf = false;
+    p = createPropertiesHelper(c, sep, pref, inf);
+    assertEquals(pref,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-prefixes")));
+    assertEquals(inf,
+        Boolean.parseBoolean(p.getProperty("fsa.dict.uses-infixes")));
+  }
+
+  /**
+   * Has the builder write a properties file to a temporary location, loads it
+   * back and removes the file again.
+   *
+   * @return the properties loaded from the file the builder wrote
+   * @throws IOException
+   *           if the temporary file cannot be created, written or read
+   */
+  private Properties createPropertiesHelper(Charset c, String sep,
+      boolean pref, boolean inf) throws IOException {
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    File f = File.createTempFile(POSDictionayBuilderTest.class.getName(),
+        ".info");
+    try {
+      builder.createProperties(c, sep, pref, inf, f);
+
+      Properties prop = new Properties();
+      InputStream is = new FileInputStream(f);
+      try {
+        prop.load(is);
+      } finally {
+        // Close even when load() fails so the file handle is not leaked.
+        is.close();
+      }
+      return prop;
+    } finally {
+      // Remove the temp file on both the success and the failure path.
+      f.delete();
+    }
+  }
+
+}
Propchange: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java (added)
+++ opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,46 @@
+package opennlp.morfologik.lemmatizer;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.nio.charset.Charset;
+
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+
+import org.junit.Test;
+
+/**
+ * Tests {@link MorfologikLemmatizer} against a dictionary built from
+ * {@code /dictionaryWithLemma.txt}.
+ */
+public class MorfologikLemmatizerTest {
+
+  /**
+   * Checks the lemma returned for a few word/tag pairs of the sample
+   * dictionary.
+   */
+  @Test
+  public void testLemmatizeInsensitive() throws Exception {
+    DictionaryLemmatizer dict = createDictionary(false);
+
+    assertEquals("casar", dict.lemmatize("casa", "V"));
+    assertEquals("casa", dict.lemmatize("casa", "NOUN"));
+
+    assertEquals("casa", dict.lemmatize("Casa", "PROP"));
+
+  }
+
+  /**
+   * Builds a binary dictionary from the sample table in a temporary file and
+   * opens it with a {@link MorfologikLemmatizer}.
+   *
+   * @param caseSensitive
+   *          NOTE(review): currently unused; the lemmatizer is created from
+   *          the dictionary URL only, so this flag has no effect here
+   */
+  private MorfologikLemmatizer createDictionary(boolean caseSensitive)
+      throws Exception {
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    // new File(URI) decodes percent-escapes (such as %20 for a space) that
+    // URL.getFile() would leave in the path.
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // Do not leave the generated dictionary behind in the temp directory.
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    return new MorfologikLemmatizer(dictOutFile.toURI().toURL());
+  }
+
+}
Propchange: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java (added)
+++ opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java Mon Dec 2 13:23:04 2013
@@ -0,0 +1,92 @@
+package opennlp.morfologik.tagdict;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.List;
+
+import morfologik.stemming.Dictionary;
+import opennlp.morfologik.builder.MorfologikDictionayBuilder;
+import opennlp.morfologik.builder.POSDictionayBuilderTest;
+import opennlp.morfologik.tagdict.MorfologikTagDictionary;
+import opennlp.tools.postag.TagDictionary;
+
+import org.junit.Test;
+
+/**
+ * Tests {@link MorfologikTagDictionary} against a dictionary built from
+ * {@code /dictionaryWithLemma.txt}.
+ */
+public class MorfologikTagDictionaryTest {
+
+  /**
+   * "carro" is listed in the sample table with a tag but without a lemma; its
+   * tag must still be returned.
+   */
+  @Test
+  public void testNoLemma() throws Exception {
+    MorfologikTagDictionary dict = createDictionary(false);
+
+    List<String> tags = Arrays.asList(dict.getTags("carro"));
+    assertEquals(1, tags.size());
+    assertTrue(tags.contains("NOUN"));
+
+  }
+
+  @Test
+  public void testPOSDictionaryInsensitive() throws Exception {
+    TagDictionary dict = createDictionary(false);
+
+    List<String> tags = Arrays.asList(dict.getTags("casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+    // A case-insensitive dictionary lower-cases the query, so "Casa" resolves
+    // to the entries stored under "casa". The PROP entry stored as "Casa" is
+    // not reachable this way; it would have to be lower case in the
+    // dictionary.
+    tags = Arrays.asList(dict.getTags("Casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+  }
+
+  @Test
+  public void testPOSDictionarySensitive() throws Exception {
+    TagDictionary dict = createDictionary(true);
+
+    List<String> tags = Arrays.asList(dict.getTags("casa"));
+    assertEquals(2, tags.size());
+    assertTrue(tags.contains("NOUN"));
+    assertTrue(tags.contains("V"));
+
+    // With case-sensitive lookup the capitalised form matches only its own
+    // entry, the proper noun.
+    tags = Arrays.asList(dict.getTags("Casa"));
+    assertEquals(1, tags.size());
+    assertTrue(tags.contains("PROP"));
+
+  }
+
+  /**
+   * Builds a binary dictionary from the sample table in a temporary file and
+   * wraps it in a {@link MorfologikTagDictionary}.
+   *
+   * @param caseSensitive
+   *          passed through to the {@link MorfologikTagDictionary}
+   *          constructor
+   */
+  private MorfologikTagDictionary createDictionary(boolean caseSensitive)
+      throws Exception {
+
+    MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder();
+    // new File(URI) decodes percent-escapes (such as %20 for a space) that
+    // URL.getFile() would leave in the path.
+    File dictInFile = new File(POSDictionayBuilderTest.class.getResource(
+        "/dictionaryWithLemma.txt").toURI());
+
+    File dictOutFile = File.createTempFile(
+        POSDictionayBuilderTest.class.getName(), ".dict");
+    // Do not leave the generated dictionary behind in the temp directory.
+    dictOutFile.deleteOnExit();
+
+    builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", true,
+        true);
+
+    return new MorfologikTagDictionary(
+        Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive);
+  }
+
+}
Propchange: opennlp/addons/morfologik-addon/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt
URL: http://svn.apache.org/viewvc/opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt?rev=1547014&view=auto
==============================================================================
--- opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt (added)
+++ opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt Mon Dec 2 13:23:04 2013
@@ -0,0 +1,10 @@
+casa casa NOUN
+casa casar V
+Casa Casa PROP
+casinha casa NOUN
+casona casa NOUN
+menina menino NOUN
+menino menino NOUN
+meninão menino NOUN
+menininho menino NOUN
+carro NOUN
Propchange: opennlp/addons/morfologik-addon/src/test/resources/dictionaryWithLemma.txt
------------------------------------------------------------------------------
svn:mime-type = text/plain