You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/31 11:02:02 UTC

svn commit: r1129577 - /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java

Author: joern
Date: Tue May 31 09:02:02 2011
New Revision: 1129577

URL: http://svn.apache.org/viewvc?rev=1129577&view=rev
Log:
OPENNLP-187 Added ngram training support

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java?rev=1129577&r1=1129576&r2=1129577&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java Tue May 31 09:02:02 2011
@@ -27,9 +27,11 @@ import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.postag.POSDictionary;
 import opennlp.tools.postag.POSModel;
 import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerME;
 import opennlp.tools.postag.WordTagSampleStream;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
@@ -90,6 +92,24 @@ public final class POSTaggerTrainerTool 
     ObjectStream<POSSample> sampleStream = openSampleData("Training", trainingDataInFile, 
         parameters.getEncoding());
     
+    
+    Dictionary ngramDict = null;
+    
+    String ngramCutoffString = CmdLineUtil.getParameter("-ngram", args);
+    
+    if (ngramCutoffString != null) {
+      System.err.print("Building ngram dictionary ... ");
+      int ngramCutoff = Integer.parseInt(ngramCutoffString);
+      try {
+        ngramDict = POSTaggerME.buildNGramDictionary(sampleStream, ngramCutoff);
+        sampleStream.reset();
+      } catch (IOException e) {
+        CmdLineUtil.printTrainingIoError(e);
+        throw new TerminateToolException(-1);
+      }
+      System.err.println("done");
+    }
+    
     POSModel model;
     try {
       
@@ -103,11 +123,11 @@ public final class POSTaggerTrainerTool 
       if (mlParams == null) {
         // depending on model and sequence choose training method
         model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
-             sampleStream, parameters.getModel(), tagdict, null, parameters.getCutoff(), parameters.getNumberOfIterations());
+             sampleStream, parameters.getModel(), tagdict, ngramDict, parameters.getCutoff(), parameters.getNumberOfIterations());
       }
       else {
         model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
-            sampleStream, mlParams, tagdict, null);
+            sampleStream, mlParams, tagdict, ngramDict);
       }
     }
     catch (IOException e) {