You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2016/11/09 21:11:01 UTC

[07/16] opennlp git commit: OPENNLP-622 Fixed issues related to command line.

OPENNLP-622 Fixed issues related to command line.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d1fab8cd
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d1fab8cd
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d1fab8cd

Branch: refs/heads/trunk
Commit: d1fab8cd4215ddf65ce98ef6aae2bc06720be742
Parents: f588858
Author: William Colen <co...@apache.org>
Authored: Fri Jul 8 19:18:54 2016 +0000
Committer: William Colen <co...@apache.org>
Committed: Fri Jul 8 19:18:54 2016 +0000

----------------------------------------------------------------------
 .../builder/XMLDictionaryToTableParams.java     | 11 ++++-
 .../builder/XMLDictionaryToTableTool.java       | 51 ++++++++++++++++++--
 .../tagdict/MorfologikPOSTaggerFactory.java     | 26 ----------
 .../tagdict/POSTaggerFactoryTest.java           |  6 ++-
 4 files changed, 63 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
index b88cc5d..4ee8cd4 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableParams.java
@@ -19,6 +19,7 @@ package opennlp.morfologik.cmdline.builder;
 
 import java.io.File;
 
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
 import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.params.EncodingParameter;
 
@@ -30,7 +31,15 @@ interface XMLDictionaryToTableParams extends EncodingParameter {
   @ParameterDescription(valueName = "in", description = "OpenNLP XML Tag Dictionary.")
   File getInputFile();
 
-  @ParameterDescription(valueName = "out", description = "Tab separated format.")
+  @ParameterDescription(valueName = "out", description = "Output for Morfologik (.info will be also created).")
   File getOutputFile();
 
+  @ParameterDescription(valueName = "char", description = "Columm separator (must be a single character)")
+  @OptionalParameter(defaultValue=",")
+  String getSeparator();
+  
+  @ParameterDescription(valueName = "value", description = " Type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none].")
+  @OptionalParameter(defaultValue="prefix")
+  String getEncoder();
+  
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
index c87f016..0e7f2d5 100644
--- a/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
+++ b/src/main/java/opennlp/morfologik/cmdline/builder/XMLDictionaryToTableTool.java
@@ -23,8 +23,11 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.Iterator;
+import java.util.Properties;
 
+import morfologik.stemming.DictionaryMetadata;
 import opennlp.tools.cmdline.BasicCmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
@@ -35,6 +38,8 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
   interface Params extends XMLDictionaryToTableParams {
   }
 
+  private String SEPARATOR;
+
   public String getShortDescription() {
     return "reads an OpenNLP XML tag dictionary and outputs it in a tab separated file";
   }
@@ -49,6 +54,7 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
     File dictInFile = params.getInputFile();
     File dictOutFile = params.getOutputFile();
     Charset encoding = params.getEncoding();
+    SEPARATOR = params.getSeparator();
 
     CmdLineUtil.checkInputFile("dictionary input file", dictInFile);
     CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile);
@@ -66,17 +72,56 @@ public class XMLDictionaryToTableTool extends BasicCmdLineTool {
         encoding)) {
       while (iterator.hasNext()) {
         String word = iterator.next();
-        String wordAndLemma = word + "\t\t"; // lemma is empty
         for (String tag : tagDictionary.getTags(word)) {
-          writer.write(wordAndLemma + tag);
-          writer.newLine();
+          if(valid(word,tag)) {
+            String entry = createEntry(word, tag);
+            writer.write(entry);
+            writer.newLine();
+          }
         }
       }
       writer.close();
+      System.out.println("Created dictionary: " + dictOutFile.toPath());
     } catch (IOException e) {
       throw new TerminateToolException(-1, "Error while writing output: "
           + e.getMessage(), e);
     }
+    
+    Properties info = new Properties();
+    info.setProperty("fsa.dict.separator", SEPARATOR);
+    info.setProperty("fsa.dict.encoding", params.getEncoding().name());
+    info.setProperty("fsa.dict.encoder", params.getEncoder());
+    
+    Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictOutFile.toPath());
+    
+    try {
+      info.store(Files.newOutputStream(metaPath), "Info file for FSA Morfologik dictionary.");
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "Error while writing metadata output: "
+          + e.getMessage(), e);
+    }
+    System.out.println("Created metadata: " + dictOutFile.toPath());
+    
+  }
+
+  /** Returns false, after printing a warning, when word or tag contains SEPARATOR. */
+  private boolean valid(String word, String tag) {
+    boolean clean = !word.contains(SEPARATOR) && !tag.contains(SEPARATOR);
+    if (!clean) {
+      // Report the rejected pair on standard output so the user can see what was dropped.
+      System.out.println("Warn: invalid entry because contains separator - word: "
+          + word + " tag: " + tag);
+    }
+    return clean;
+  }
+
+  /** Builds one output line: an empty base column, then the word, then the tag. */
+  private String createEntry(String word, String tag) {
+    StringBuilder line = new StringBuilder();
+    line.append(SEPARATOR); // the base column comes first and is left empty
+    line.append(word).append(SEPARATOR).append(tag);
+    return line.toString();
   }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
index dcb6554..93d6c61 100644
--- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
+++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java
@@ -17,8 +17,6 @@
 
 package opennlp.morfologik.tagdict;
 
-import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile;
-
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileNotFoundException;
@@ -27,7 +25,6 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.nio.file.Paths;
 import java.util.Map;
 
 import morfologik.stemming.DictionaryMetadata;
@@ -81,29 +78,6 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory {
   protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) {
     super.init(ngramDictionary, null);
     this.dict = posDictionary;
-
-    // get the dictionary path
-    String path = System.getProperty("morfologik.dict");
-    if (path == null) {
-      throw new IllegalArgumentException(
-          "The property fsa.dict is missing! -Dmorfologik.dict=path");
-    }
-
-    // now we try to load it...
-    try {
-      this.dictData = Files.readAllBytes(Paths.get(path));
-      this.dictInfo = Files.readAllBytes(getExpectedPropertiesFile(path)
-          .toPath());
-
-      this.dict = createMorfologikDictionary(dictData, dictInfo);
-
-    } catch (IllegalArgumentException e) {
-      throw new IllegalArgumentException(
-          "The file is not a Morfologik dictionary!", e);
-    } catch (IOException e) {
-      throw new IllegalArgumentException(
-          "Could not open the Morfologik dictionary or the .info file", e);
-    }
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d1fab8cd/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
index 9233979..7341a02 100644
--- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
+++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java
@@ -17,7 +17,7 @@
 
 package opennlp.morfologik.tagdict;
 
-import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.*;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
@@ -71,6 +71,8 @@ public class POSTaggerFactoryTest {
     POSTaggerFactory factory = posModel.getFactory();
     assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
 
+    factory = null;
+    
     ByteArrayOutputStream out = new ByteArrayOutputStream();
     posModel.serialize(out);
     ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
@@ -79,6 +81,8 @@ public class POSTaggerFactoryTest {
 
     factory = fromSerialized.getFactory();
     assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary);
+    
+    assertEquals(2, factory.getTagDictionary().getTags("casa").length);
   }
 
 }
\ No newline at end of file