You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2016/11/09 21:11:01 UTC
[07/16] opennlp git commit: OPENNLP-622 Fixed issues related to command line.
OPENNLP-622 Fixed issues related to command line.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d1fab8cd
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d1fab8cd
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d1fab8cd
Branch: refs/heads/trunk
Commit: d1fab8cd4215ddf65ce98ef6aae2bc06720be742
Parents: f588858
Author: William Colen <co...@apache.org>
Authored: Fri Jul 8 19:18:54 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Fri Jul 8 19:18:54 2016 +0000
----------------------------------------------------------------------
.../builder/XMLDictionaryToTableParams.java | 11 ++++-
.../builder/XMLDictionaryToTableTool.java | 51 ++++++++++++++++++--
.../tagdict/MorfologikPOSTaggerFactory.java | 26 ----------
.../tagdict/POSTaggerFactoryTest.java | 6 ++-
4 files changed, 63 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
index b88cc5d..4ee8cd4 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
@@ -19,6 +19,7 @@ package opennlp.morfologik.cmdline.builder;
import java.io.File;
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.params.EncodingParameter;
@@ -30,7 +31,15 @@ interface XMLDictionaryToTableParams extends EncodingParameter {
@ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
File getInputFile();
- @ParameterDescription(valueName = "out", description = "Tab separated format.")
+ @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).")
File getOutputFile();
+ @ParameterDescription(valueName = "char", description = "Columm separator (must be a single character)")
+ @OptionalParameter(defaultValue=",")
+ String getSeparator();
+
+ @ParameterDescription(valueName = "value", description = " Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].")
+ @OptionalParameter(defaultValue="prefix")
+ String getEncoder();
+
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
index c87f016..0e7f2d5 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -23,8 +23,11 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.Iterator;
+import java.util.Properties;
+import morfologik.stemming.DictionaryMetadata;
import opennlp.tools.cmdline.BasicCmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
@@ -35,6 +38,8 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
interface Params extends XMLDictionaryToTableParams {
}
+ private String SEPARATOR;
+
public String getShortDescription() {
return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
}
@@ -49,6 +54,7 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
File dictInFile = params.getInputFile();
File dictOutFile = params.getOutputFile();
Charset encoding = params.getEncoding();
+ SEPARATOR = params.getSeparator();
CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
@@ -66,17 +72,56 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
encoding)) {
while (iterator.hasNext()) {
String word = iterator.next();
- String wordAndLemma = word + "\t\t"; // lemma is empty
for (String tag : tagDictionary.getTags(word)) {
- writer.write(wordAndLemma + tag);
- writer.newLine();
+ if(valid(word,tag)) {
+ String entry = createEntry(word, tag);
+ writer.write(entry);
+ writer.newLine();
+ }
}
}
writer.close();
+ System.out.println("Created dictionary: " + dictOutFile.toPath());
} catch (IOException e) {
throw new TerminateToolException(-1, "Error while writing output: "
+ e.getMessage(), e);
}
+
+ Properties info = new Properties();
+ info.setProperty("fsa.dict.separator", SEPARATOR);
+ info.setProperty("fsa.dict.encoding", params.getEncoding().name());
+ info.setProperty("fsa.dict.encoder", params.getEncoder());
+
+ Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath());
+
+ try {
+ info.store(Files.newOutputStream(metaPath), "Info file for FSA Morfologik dictionary.");
+ } catch (IOException e) {
+ throw new TerminateToolException(-1, "Error while writing metadata output: "
+ + e.getMessage(), e);
+ }
+ System.out.println("Created metadata: " + dictOutFile.toPath());
+
+ }
+
+ private boolean valid(String word, String tag) {
+ if(word.contains(SEPARATOR) || tag.contains(SEPARATOR)) {
+ System.out
+ .println("Warn: invalid entry because contains separator - word: "
+ + word + " tag: " + tag);
+ return false;
+ }
+
+ return true;
+ }
+
+ private String createEntry(String word, String tag) {
+
+ String entry = "" + SEPARATOR +// base
+ word + SEPARATOR +
+ tag;
+
+ return entry;
}
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index dcb6554..93d6c61 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -17,8 +17,6 @@
package opennlp.morfologik.tagdict;
-import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
-
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
@@ -27,7 +25,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.nio.file.Paths;
import java.util.Map;
import morfologik.stemming.DictionaryMetadata;
@@ -81,29 +78,6 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
super.init(ngramDictionary, null);
this.dict = posDictionary;
-
- // get the dictionary path
- String path = System.getProperty("morfologik.dict");
- if (path == null) {
- throw new IllegalArgumentException(
- "The property fsa.dict is missing! -Dmorfologik.dict=path");
- }
-
- // now we try to load it...
- try {
- this.dictData = Files.readAllBytes(Paths.get(path));
- this.dictInfo = Files.readAllBytes(getExpectedPropertiesFile(path)
- .toPath());
-
- this.dict = createMorfologikDictionary(dictData, dictInfo);
-
- } catch (IllegalArgumentException e) {
- throw new IllegalArgumentException(
- "The file is not a Morfologik dictionary!", e);
- } catch (IOException e) {
- throw new IllegalArgumentException(
- "Could not open the Morfologik dictionary or the .info file", e);
- }
}
@Override
http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
index 9233979..7341a02 100644
--- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -17,7 +17,7 @@
package opennlp.morfologik.tagdict;
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@@ -71,6 +71,8 @@ public class POSTaggerFactoryTest {
POSTaggerFactory factory = posModel.getFactory();
assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+ factory = null;
+
ByteArrayOutputStream out = new ByteArrayOutputStream();
posModel.serialize(out);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
@@ -79,6 +81,8 @@ public class POSTaggerFactoryTest {
factory = fromSerialized.getFactory();
assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+
+ assertEquals(2, factory.getTagDictionary().getTags("casa").length);
}
}
\ No newline at end of file