You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/31 11:02:02 UTC
svn commit: r1129577 -
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
Author: joern
Date: Tue May 31 09:02:02 2011
New Revision: 1129577
URL: http://svn.apache.org/viewvc?rev=1129577&view=rev
Log:
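OPENNLP-187 Added ngram training support: when the optional -ngram <cutoff> parameter is given, an ngram dictionary is built from the training samples and passed to POSTaggerME.train instead of null.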
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java?rev=1129577&r1=1129576&r2=1129577&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java Tue May 31 09:02:02 2011
@@ -27,9 +27,11 @@ import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.postag.POSDictionary;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.postag.WordTagSampleStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
@@ -90,6 +92,24 @@ public final class POSTaggerTrainerTool
ObjectStream<POSSample> sampleStream = openSampleData("Training", trainingDataInFile,
parameters.getEncoding());
+
+ Dictionary ngramDict = null;
+
+ String ngramCutoffString = CmdLineUtil.getParameter("-ngram", args);
+
+ if (ngramCutoffString != null) {
+ System.err.print("Building ngram dictionary ... ");
+ int ngramCutoff = Integer.parseInt(ngramCutoffString);
+ try {
+ ngramDict = POSTaggerME.buildNGramDictionary(sampleStream, ngramCutoff);
+ sampleStream.reset();
+ } catch (IOException e) {
+ CmdLineUtil.printTrainingIoError(e);
+ throw new TerminateToolException(-1);
+ }
+ System.err.println("done");
+ }
+
POSModel model;
try {
@@ -103,11 +123,11 @@ public final class POSTaggerTrainerTool
if (mlParams == null) {
// depending on model and sequence choose training method
model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
- sampleStream, parameters.getModel(), tagdict, null, parameters.getCutoff(), parameters.getNumberOfIterations());
+ sampleStream, parameters.getModel(), tagdict, ngramDict, parameters.getCutoff(), parameters.getNumberOfIterations());
}
else {
model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
- sampleStream, mlParams, tagdict, null);
+ sampleStream, mlParams, tagdict, ngramDict);
}
}
catch (IOException e) {