You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/07/18 22:29:47 UTC

svn commit: r1148042 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag: POSTaggerCrossValidatorTool.java POSTaggerTrainerTool.java TrainingParameters.java TrainingParametersI.java TrainingParams.java

Author: colen
Date: Mon Jul 18 20:29:46 2011
New Revision: 1148042

URL: http://svn.apache.org/viewvc?rev=1148042&view=rev
Log:
OPENNLP-227 Updated POS Tagger trainer and cross-validation (cv) tools

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParams.java   (contents, props changed)
      - copied, changed from r1147973, incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParametersI.java
Removed:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParameters.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParametersI.java
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java?rev=1148042&r1=1148041&r2=1148042&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java Mon Jul 18 20:29:46 2011
@@ -22,8 +22,8 @@ import java.io.FileInputStream;
 import java.io.IOException;
 
 import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.CVParams;
 import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CVParams;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
@@ -31,11 +31,10 @@ import opennlp.tools.postag.POSDictionar
 import opennlp.tools.postag.POSSample;
 import opennlp.tools.postag.POSTaggerCrossValidator;
 import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.model.ModelType;
 
 public final class POSTaggerCrossValidatorTool implements CmdLineTool {
   
-  interface Parameters extends CVParams, TrainingParametersI {
+  interface CVToolParams extends CVParams, TrainingParams {
     
   }
 
@@ -49,16 +48,16 @@ public final class POSTaggerCrossValidat
 
   public String getHelp() {
     return "Usage: " + CLI.CMD + " " + getName() + " "
-        + ArgumentParser.createUsage(Parameters.class);
+        + ArgumentParser.createUsage(CVToolParams.class);
   }
 
   public void run(String[] args) {
-    if (!ArgumentParser.validateArguments(args, Parameters.class)) {
+    if (!ArgumentParser.validateArguments(args, CVToolParams.class)) {
       System.err.println(getHelp());
       throw new TerminateToolException(1);
     }
     
-    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    CVToolParams params = ArgumentParser.parse(args, CVToolParams.class);
 
     opennlp.tools.util.TrainingParameters mlParams = CmdLineUtil
         .loadTrainingParameters(params.getParams(), false);
@@ -79,7 +78,7 @@ public final class POSTaggerCrossValidat
 
       if (mlParams == null) {
         validator = new POSTaggerCrossValidator(params.getLang(),
-            getModelType(params.getType()), tagdict, null, params.getCutoff(),
+            POSTaggerTrainerTool.getModelType(params.getType()), tagdict, null, params.getCutoff(),
             params.getIterations());
       } else {
         validator = new POSTaggerCrossValidator(params.getLang(),
@@ -104,24 +103,4 @@ public final class POSTaggerCrossValidat
 
     System.out.println("Accuracy: " + validator.getWordAccuracy());
   }
-
-  private ModelType getModelType(String modelString) {
-    ModelType model;
-    if (modelString == null)
-      modelString = "maxent";
-    
-    if (modelString.equals("maxent")) {
-      model = ModelType.MAXENT; 
-    }
-    else if (modelString.equals("perceptron")) {
-      model = ModelType.PERCEPTRON; 
-    }
-    else if (modelString.equals("perceptron_sequence")) {
-      model = ModelType.PERCEPTRON_SEQUENCE; 
-    }
-    else {
-      model = null;
-    }
-    return model;
-  }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java?rev=1148042&r1=1148041&r2=1148042&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java Mon Jul 18 20:29:46 2011
@@ -23,10 +23,12 @@ import java.io.IOException;
 import java.nio.charset.Charset;
 
 import opennlp.model.TrainUtil;
+import opennlp.tools.cmdline.ArgumentParser;
 import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.TrainingToolParams;
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.postag.POSDictionary;
 import opennlp.tools.postag.POSModel;
@@ -35,8 +37,13 @@ import opennlp.tools.postag.POSTaggerME;
 import opennlp.tools.postag.WordTagSampleStream;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.model.ModelType;
 
 public final class POSTaggerTrainerTool implements CmdLineTool {
+  
+  interface TrainerToolParams extends TrainingParams, TrainingToolParams{
+
+  }
 
   public String getName() {
     return "POSTaggerTrainer";
@@ -47,9 +54,8 @@ public final class POSTaggerTrainerTool 
   }
   
   public String getHelp() {
-    return "Usage: " + CLI.CMD + " " + getName() + " " + TrainingParameters.getParameterUsage() 
-        + " -data trainingData -model model\n" +
-        TrainingParameters.getDescription();
+    return "Usage: " + CLI.CMD + " " + getName() + " "
+      + ArgumentParser.createUsage(TrainerToolParams.class);
   }
 
   static ObjectStream<POSSample> openSampleData(String sampleDataName,
@@ -65,41 +71,36 @@ public final class POSTaggerTrainerTool 
   }
   
   public void run(String[] args) {
-    if (args.length < 8) {
-      System.out.println(getHelp());
+    if (!ArgumentParser.validateArguments(args, TrainerToolParams.class)) {
+      System.err.println(getHelp());
       throw new TerminateToolException(1);
     }
     
-    TrainingParameters parameters = new TrainingParameters(args);
-    
-    if(!parameters.isValid()) {
-      System.out.println(getHelp());
-      throw new TerminateToolException(1);
-    }    
+    TrainerToolParams params = ArgumentParser.parse(args,
+        TrainerToolParams.class);    
     
     opennlp.tools.util.TrainingParameters mlParams = 
-      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), true);
+      CmdLineUtil.loadTrainingParameters(params.getParams(), true);
     
     if (mlParams != null && !TrainUtil.isValid(mlParams.getSettings())) {
       System.err.println("Training parameters file is invalid!");
       throw new TerminateToolException(-1);
     }
     
-    File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
-    File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
+    File trainingDataInFile = params.getData();
+    File modelOutFile = params.getModel();
     
     CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile);
     ObjectStream<POSSample> sampleStream = openSampleData("Training", trainingDataInFile, 
-        parameters.getEncoding());
+        params.getEncoding());
     
     
     Dictionary ngramDict = null;
     
-    String ngramCutoffString = CmdLineUtil.getParameter("-ngram", args);
+    Integer ngramCutoff = params.getNgram();
     
-    if (ngramCutoffString != null) {
+    if (ngramCutoff != null) {
       System.err.print("Building ngram dictionary ... ");
-      int ngramCutoff = Integer.parseInt(ngramCutoffString);
       try {
         ngramDict = POSTaggerME.buildNGramDictionary(sampleStream, ngramCutoff);
         sampleStream.reset();
@@ -115,18 +116,17 @@ public final class POSTaggerTrainerTool 
       
       // TODO: Move to util method ...
       POSDictionary tagdict = null;
-      if (parameters.getDictionaryPath() != null) {
-        // TODO: Should re-factored as described in OPENNLP-193
-        tagdict = new POSDictionary(parameters.getDictionaryPath());
+      if (params.getDict() != null) {
+        tagdict = POSDictionary.create(new FileInputStream(params.getDict()));
       }
       
       if (mlParams == null) {
         // depending on model and sequence choose training method
-        model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
-             sampleStream, parameters.getModel(), tagdict, ngramDict, parameters.getCutoff(), parameters.getNumberOfIterations());
+        model = opennlp.tools.postag.POSTaggerME.train(params.getLang(),
+             sampleStream, getModelType(params.getType()), tagdict, ngramDict, params.getCutoff(), params.getIterations());
       }
       else {
-        model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
+        model = opennlp.tools.postag.POSTaggerME.train(params.getLang(),
             sampleStream, mlParams, tagdict, ngramDict);
       }
     }
@@ -144,4 +144,24 @@ public final class POSTaggerTrainerTool 
     
     CmdLineUtil.writeModel("pos tagger", modelOutFile, model);
   }
+  
+  static ModelType getModelType(String modelString) {
+    ModelType model;
+    if (modelString == null)
+      modelString = "maxent";
+    
+    if (modelString.equals("maxent")) {
+      model = ModelType.MAXENT; 
+    }
+    else if (modelString.equals("perceptron")) {
+      model = ModelType.PERCEPTRON; 
+    }
+    else if (modelString.equals("perceptron_sequence")) {
+      model = ModelType.PERCEPTRON_SEQUENCE; 
+    }
+    else {
+      model = null;
+    }
+    return model;
+  }
 }

Copied: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParams.java (from r1147973, incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParametersI.java)
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParams.java?p2=incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParams.java&p1=incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParametersI.java&r1=1147973&r2=1148042&rev=1148042&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParametersI.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParams.java Mon Jul 18 20:29:46 2011
@@ -28,13 +28,17 @@ import opennlp.tools.cmdline.BasicTraini
  * 
  * Note: Do not use this class, internal use only!
  */
-interface TrainingParametersI extends BasicTrainingParams {
+interface TrainingParams extends BasicTrainingParams {
   
-  @ParameterDescription(valueName = "modelType", description = "The type of the token name finder model. One of axent|perceptron|perceptron_sequence.")
+  @ParameterDescription(valueName = "maxent|perceptron|perceptron_sequence", description = "The type of the token name finder model. One of maxent|perceptron|perceptron_sequence.")
   @OptionalParameter(defaultValue = "maxent")
   String getType();
   
   @ParameterDescription(valueName = "dictionaryPath", description = "The feature generator descriptor file")
   @OptionalParameter
-  File getDict();  
+  File getDict();
+  
+  @ParameterDescription(valueName = "cutoff", description = "NGram cutoff. If not specified will not create ngram dictionary.")
+  @OptionalParameter
+  Integer getNgram();
 }

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/TrainingParams.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain