You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/18 20:34:55 UTC
svn commit: r1124372 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: chunker/ cmdline/ cmdline/chunker/ cmdline/doccat/ cmdline/namefind/ cmdline/parser/ cmdline/postag/ cmdline/sentdetect/ cmdline/tokenizer/ doccat/ namefind/ ...

Author: joern
Date: Wed May 18 18:34:54 2011
New Revision: 1124372

URL: http://svn.apache.org/viewvc?rev=1124372&view=rev
Log:
OPENNLP-175 Updated cmd line interface and added train methods to train with training parameters file/object

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java   (with props)
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java Wed May 18 18:34:54 2011
@@ -30,6 +30,7 @@ import java.util.Map;
 import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
 import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
 import opennlp.model.TwoPassDataIndexer;
 import opennlp.tools.util.BeamSearch;
 import opennlp.tools.util.HashSumEventStream;
@@ -38,6 +39,7 @@ import opennlp.tools.util.PlainTextByLin
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.SequenceValidator;
 import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.BaseModel;
 import opennlp.tools.util.model.ModelUtil;
 
@@ -206,6 +208,24 @@ public class ChunkerME implements Chunke
   }
 
   public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in, 
+      ChunkerContextGenerator contextGenerator, TrainingParameters mlParams)
+  throws IOException {
+    
+    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+//    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+    
+    EventStream es = new ChunkerEventStream(in, contextGenerator);
+    HashSumEventStream hses = new HashSumEventStream(es);
+    
+    AbstractModel maxentModel = TrainUtil.train(hses, mlParams.getSettings());
+    
+    manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
+        hses.calculateHashSum().toString(16));
+    
+    return new ChunkerModel(lang, maxentModel, manifestInfoEntries);
+  }
+  
+  public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in, 
       int cutoff, int iterations, ChunkerContextGenerator contextGenerator)
       throws IOException {
     

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java Wed May 18 18:34:54 2011
@@ -23,6 +23,7 @@ import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.IllegalCharsetNameException;
@@ -31,6 +32,8 @@ import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 
+import opennlp.model.TrainUtil;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.BaseModel;
 
 /**
@@ -331,4 +334,34 @@ public final class CmdLineUtil {
     System.err.println("IO Error while reading from stdin: " + e.getMessage());
     throw new TerminateToolException(-1);
   }
+  
+  // paramFile is optional; passing null is allowed and yields a null result
+  public static TrainingParameters loadTrainingParameters(String paramFile) {
+    
+    TrainingParameters params = null;
+    
+    if (paramFile != null) {
+      
+      checkInputFile("Training Parameter", new File(paramFile));
+      
+      InputStream paramsIn = null;
+      try {
+        paramsIn = new FileInputStream(new File(paramFile));
+        
+        params = new opennlp.tools.util.TrainingParameters(paramsIn);
+      } catch (IOException e) {
+        // TODO: print error and exit
+        e.printStackTrace();
+      }
+      finally {
+        try {
+          if (paramsIn != null)
+            paramsIn.close();
+        } catch (IOException e) {
+        }
+      }
+    }
+    
+    return params;
+  }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,10 +22,12 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.charset.Charset;
 
+import opennlp.model.TrainUtil;
 import opennlp.tools.chunker.ChunkSample;
 import opennlp.tools.chunker.ChunkSampleStream;
 import opennlp.tools.chunker.ChunkerME;
 import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.chunker.DefaultChunkerContextGenerator;
 import opennlp.tools.cmdline.BasicTrainingParameters;
 import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
@@ -76,6 +78,21 @@ public class ChunkerTrainerTool implemen
       throw new TerminateToolException(1);
     }
     
+    opennlp.tools.util.TrainingParameters mlParams = 
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+    
+    if (mlParams != null) {
+      if (!TrainUtil.isValid(mlParams.getSettings())) {
+        System.err.println("Training parameters file is invalid!");
+        throw new TerminateToolException(-1);
+      }
+      
+      if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+        System.err.println("Sequence training is not supported!");
+        throw new TerminateToolException(-1);
+      }
+    }
+    
     File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
 
@@ -85,8 +102,14 @@ public class ChunkerTrainerTool implemen
     
     ChunkerModel model;
     try {
-      model = ChunkerME.train(parameters.getLanguage(), sampleStream, 
-          parameters.getCutoff(), parameters.getNumberOfIterations());
+      if (mlParams == null) {
+        model = ChunkerME.train(parameters.getLanguage(), sampleStream, 
+            parameters.getCutoff(), parameters.getNumberOfIterations());
+      }
+      else {
+        model = ChunkerME.train(parameters.getLanguage(), sampleStream, 
+            new DefaultChunkerContextGenerator(), mlParams);
+      }
     } catch (IOException e) {
       CmdLineUtil.printTrainingIoError(e);
       throw new TerminateToolException(-1);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.charset.Charset;
 
+import opennlp.model.TrainUtil;
 import opennlp.tools.cmdline.BasicTrainingParameters;
 import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
@@ -75,6 +76,21 @@ public class DoccatTrainerTool implement
       throw new TerminateToolException(1);
     }
     
+    opennlp.tools.util.TrainingParameters mlParams = 
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+    
+    if (mlParams != null) {
+      if (!TrainUtil.isValid(mlParams.getSettings())) {
+        System.err.println("Training parameters file is invalid!");
+        throw new TerminateToolException(-1);
+      }
+      
+      if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+        System.err.println("Sequence training is not supported!");
+        throw new TerminateToolException(-1);
+      }
+    }
+    
     File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
 
@@ -84,8 +100,14 @@ public class DoccatTrainerTool implement
     
     DoccatModel model;
     try {
-      model = DocumentCategorizerME.train(parameters.getLanguage(), sampleStream, 
+      if (mlParams == null) {
+       model = DocumentCategorizerME.train(parameters.getLanguage(), sampleStream, 
           parameters.getCutoff(), parameters.getNumberOfIterations());
+      }
+      else {
+        model = DocumentCategorizerME.train(parameters.getLanguage(), sampleStream, 
+            mlParams);
+      }
     } catch (IOException e) {
       CmdLineUtil.printTrainingIoError(e);
       throw new TerminateToolException(-1);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Wed May 18 18:34:54 2011
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.nio.charset.Charset;
 import java.util.Collections;
 
+import opennlp.model.TrainUtil;
 import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -75,18 +76,39 @@ public final class TokenNameFinderTraine
       throw new TerminateToolException(1);
     }
     
+    opennlp.tools.util.TrainingParameters mlParams = 
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+    
+    if (mlParams != null) {
+      if (!TrainUtil.isValid(mlParams.getSettings())) {
+        System.err.println("Training parameters file is invalid!");
+        throw new TerminateToolException(-1);
+      }
+      
+      if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+        System.err.println("Sequence training is not supported!");
+        throw new TerminateToolException(-1);
+      }
+    }
+    
     File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
-
+    
     CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
     ObjectStream<NameSample> sampleStream = openSampleData("Training", trainingDataInFile,
         parameters.getEncoding());
 
     TokenNameFinderModel model;
     try {
+      if (mlParams == null) {
       model = opennlp.tools.namefind.NameFinderME.train(parameters.getLanguage(), parameters.getType(),
            sampleStream, Collections.<String, Object>emptyMap(),
            parameters.getNumberOfIterations(), parameters.getCutoff());
+      }
+      else {
+        model = opennlp.tools.namefind.NameFinderME.train(parameters.getLanguage(), parameters.getType(), sampleStream, mlParams, null,
+            Collections.<String, Object>emptyMap());
+      }
     } 
     catch (IOException e) {
       CmdLineUtil.printTrainingIoError(e);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java Wed May 18 18:34:54 2011
@@ -25,6 +25,7 @@ import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 
+import opennlp.model.TrainUtil;
 import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -107,6 +108,21 @@ public final class ParserTrainerTool imp
       throw new TerminateToolException(1);
     } 
     
+    opennlp.tools.util.TrainingParameters mlParams = 
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+    
+    if (mlParams != null) {
+      if (!TrainUtil.isValid(mlParams.getSettings())) {
+        System.err.println("Training parameters file is invalid!");
+        throw new TerminateToolException(-1);
+      }
+      
+      if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+        System.err.println("Sequence training is not supported!");
+        throw new TerminateToolException(-1);
+      }
+    }
+    
     ObjectStream<Parse> sampleStream = openTrainingData(new File(CmdLineUtil.getParameter("-data", args)), parameters.getEncoding());
     
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
@@ -119,19 +135,35 @@ public final class ParserTrainerTool imp
           new InputStreamReader(new FileInputStream(new File(CmdLineUtil.getParameter("-head-rules", args))), 
           parameters.getEncoding()));
       
-      if (ParserType.CHUNKING.equals(parameters.getParserType())) {
-        model = opennlp.tools.parser.chunking.Parser.train(
-            parameters.getLanguage(), sampleStream, rules, 
-            parameters.getNumberOfIterations(), parameters.getCutoff());
-      }
-      else if (ParserType.TREEINSERT.equals(parameters.getParserType())) {
-        model = opennlp.tools.parser.treeinsert.Parser.train(parameters.getLanguage(), sampleStream, rules, parameters.getNumberOfIterations(), 
-            parameters.getCutoff());
+      if (mlParams == null) {
+        if (ParserType.CHUNKING.equals(parameters.getParserType())) {
+          model = opennlp.tools.parser.chunking.Parser.train(
+              parameters.getLanguage(), sampleStream, rules, 
+              parameters.getNumberOfIterations(), parameters.getCutoff());
+        }
+        else if (ParserType.TREEINSERT.equals(parameters.getParserType())) {
+          model = opennlp.tools.parser.treeinsert.Parser.train(parameters.getLanguage(), sampleStream, rules, parameters.getNumberOfIterations(), 
+              parameters.getCutoff());
+        }
+        else {
+          throw new IllegalStateException();
+        }
       }
       else {
-        throw new IllegalStateException();
+        if (ParserType.CHUNKING.equals(parameters.getParserType())) {
+          model = opennlp.tools.parser.chunking.Parser.train(
+              parameters.getLanguage(), sampleStream, rules, 
+              mlParams);
+        }
+        else if (ParserType.TREEINSERT.equals(parameters.getParserType())) {
+          model = opennlp.tools.parser.treeinsert.Parser.train(parameters.getLanguage(), sampleStream, rules,
+              mlParams);
+        }
+        else {
+          throw new IllegalStateException();
+        }
+
       }
-      
     }
     catch (IOException e) {
       CmdLineUtil.printTrainingIoError(e);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.charset.Charset;
 
+import opennlp.model.TrainUtil;
 import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -74,6 +75,14 @@ public final class POSTaggerTrainerTool 
       throw new TerminateToolException(1);
     }    
     
+    opennlp.tools.util.TrainingParameters mlParams = 
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+    
+    if (mlParams != null && !TrainUtil.isValid(mlParams.getSettings())) {
+      System.err.println("Training parameters file is invalid!");
+      throw new TerminateToolException(-1);
+    }
+    
     File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
     
@@ -90,9 +99,15 @@ public final class POSTaggerTrainerTool 
         tagdict = new POSDictionary(parameters.getDictionaryPath());
       }
       
-      // depending on model and sequence choose training method
-      model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
-           sampleStream, parameters.getModel(), tagdict, null, parameters.getCutoff(), parameters.getNumberOfIterations());
+      if (mlParams == null) {
+        // depending on model and sequence choose training method
+        model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
+             sampleStream, parameters.getModel(), tagdict, null, parameters.getCutoff(), parameters.getNumberOfIterations());
+      }
+      else {
+        model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
+            sampleStream, mlParams, tagdict, null);
+      }
     }
     catch (IOException e) {
       CmdLineUtil.printTrainingIoError(e);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.charset.Charset;
 
+import opennlp.model.TrainUtil;
 import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -73,6 +74,21 @@ public final class SentenceDetectorTrain
       System.out.println(getHelp());
       throw new TerminateToolException(1);
     }
+
+    opennlp.tools.util.TrainingParameters mlParams = 
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+    
+    if (mlParams != null) {
+      if (!TrainUtil.isValid(mlParams.getSettings())) {
+        System.err.println("Training parameters file is invalid!");
+        throw new TerminateToolException(-1);
+      }
+      
+      if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+        System.err.println("Sequence training is not supported!");
+        throw new TerminateToolException(-1);
+      }
+    }
     
     File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
@@ -80,11 +96,17 @@ public final class SentenceDetectorTrain
     CmdLineUtil.checkOutputFile("sentence detector model", modelOutFile);
     ObjectStream<SentenceSample> sampleStream = 
         openSampleData("Training", trainingDataInFile, parameters.getEncoding());
-    
+
     SentenceModel model;
     try {
-      model = SentenceDetectorME.train(parameters.getLanguage(), sampleStream, true, null, 
-          parameters.getCutoff(), parameters.getNumberOfIterations());
+      if (mlParams == null) {
+        model = SentenceDetectorME.train(parameters.getLanguage(), sampleStream, true, null, 
+            parameters.getCutoff(), parameters.getNumberOfIterations());
+      }
+      else {
+        model = SentenceDetectorME.train(parameters.getLanguage(), sampleStream, true, null, 
+            mlParams);
+      }
     } catch (IOException e) {
       CmdLineUtil.printTrainingIoError(e);
       throw new TerminateToolException(-1);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.charset.Charset;
 
+import opennlp.model.TrainUtil;
 import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
@@ -73,19 +74,42 @@ public final class TokenizerTrainerTool 
       throw new TerminateToolException(1);
     }
 
+    opennlp.tools.util.TrainingParameters mlParams = 
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+    
+    if (mlParams != null) {
+      if (!TrainUtil.isValid(mlParams.getSettings())) {
+        System.err.println("Training parameters file is invalid!");
+        throw new TerminateToolException(-1);
+      }
+      
+      if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+        System.err.println("Sequence training is not supported!");
+        throw new TerminateToolException(-1);
+      }
+    }
+    
     File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
-
+    
     CmdLineUtil.checkOutputFile("tokenizer model", modelOutFile);
     ObjectStream<TokenSample> sampleStream = openSampleData("Training",
         trainingDataInFile, parameters.getEncoding());
 
     TokenizerModel model;
     try {
-      model = opennlp.tools.tokenize.TokenizerME.train(
-          parameters.getLanguage(), sampleStream, 
-          parameters.isAlphaNumericOptimizationEnabled(),
-          parameters.getCutoff(), parameters.getNumberOfIterations());
+      if (mlParams == null) {
+        model = opennlp.tools.tokenize.TokenizerME.train(
+            parameters.getLanguage(), sampleStream, 
+            parameters.isAlphaNumericOptimizationEnabled(),
+            parameters.getCutoff(), parameters.getNumberOfIterations());
+      }
+      else {
+        model = opennlp.tools.tokenize.TokenizerME.train(
+            parameters.getLanguage(), sampleStream, 
+            parameters.isAlphaNumericOptimizationEnabled(),
+            mlParams);
+      }
     } catch (IOException e) {
       CmdLineUtil.printTrainingIoError(e);
       throw new TerminateToolException(-1);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java Wed May 18 18:34:54 2011
@@ -26,10 +26,12 @@ import java.util.Map;
 import opennlp.maxent.GIS;
 import opennlp.model.AbstractModel;
 import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
 import opennlp.model.TwoPassDataIndexer;
 import opennlp.tools.tokenize.SimpleTokenizer;
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelUtil;
 
 /**
@@ -142,6 +144,21 @@ public class DocumentCategorizerME imple
     return GIS.trainModel(100, new TwoPassDataIndexer(eventStream, 5));
   }
   
+   
+   public static DoccatModel train(String languageCode, ObjectStream<DocumentSample> samples,
+       TrainingParameters mlParams, FeatureGenerator... featureGenerators)
+   throws IOException {
+     
+     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+//     ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+     
+     AbstractModel model = TrainUtil.train(
+         new DocumentCategorizerEventStream(samples, featureGenerators),
+         mlParams.getSettings());
+       
+     return new DoccatModel(languageCode, model, manifestInfoEntries);
+   }
+   
   /**
    * Trains a document categorizer model with custom feature generation.
    * 

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java Wed May 18 18:34:54 2011
@@ -37,6 +37,7 @@ import opennlp.maxent.GISModel;
 import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
 import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
 import opennlp.model.TwoPassDataIndexer;
 import opennlp.tools.util.BeamSearch;
 import opennlp.tools.util.HashSumEventStream;
@@ -45,6 +46,7 @@ import opennlp.tools.util.PlainTextByLin
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.SequenceValidator;
 import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
 import opennlp.tools.util.featuregen.AdditionalContextFeatureGenerator;
 import opennlp.tools.util.featuregen.CachedFeatureGenerator;
@@ -311,7 +313,38 @@ public class NameFinderME implements Tok
      return sprobs;
    }
 
-
+   public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples, 
+       TrainingParameters trainParams, AdaptiveFeatureGenerator generator, final Map<String, Object> resources) throws IOException {
+     
+     if (TrainUtil.isSequenceTraining(trainParams.getSettings())) {
+       throw new IllegalArgumentException("Sequence training is not supported!");
+     }
+     
+     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+//     ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+     
+     AdaptiveFeatureGenerator featureGenerator;
+     
+     if (generator != null)
+       featureGenerator = generator;
+     else 
+       featureGenerator = createFeatureGenerator();
+     
+     EventStream eventStream = new NameFinderEventStream(samples, type,
+         new DefaultNameContextGenerator(featureGenerator));
+     HashSumEventStream hses = new HashSumEventStream(eventStream);
+     
+     AbstractModel nameFinderModel = TrainUtil.train(hses, trainParams.getSettings());
+     
+//     AbstractModel nameFinderModel = GIS.trainModel(iterations, new TwoPassDataIndexer(hses, cutoff));
+     
+     manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
+         hses.calculateHashSum().toString(16));
+     
+     return new TokenNameFinderModel(languageCode, nameFinderModel,
+         resources, manifestInfoEntries);
+   }
+   
    /**
     * Trains a name finder model.
     * 

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java Wed May 18 18:34:54 2011
@@ -30,6 +30,7 @@ import java.util.Map;
 
 import opennlp.model.AbstractModel;
 import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
 import opennlp.model.TwoPassDataIndexer;
 import opennlp.tools.chunker.ChunkSample;
 import opennlp.tools.chunker.Chunker;
@@ -56,6 +57,7 @@ import opennlp.tools.util.InvalidFormatE
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelType;
 import opennlp.tools.util.model.ModelUtil;
 
@@ -275,6 +277,56 @@ public class Parser extends AbstractBott
     return opennlp.maxent.GIS.trainModel(iterations, new TwoPassDataIndexer(es, cut));
   }
 
+  public static ParserModel train(String languageCode, ObjectStream<Parse> parseSamples, HeadRules rules, TrainingParameters mlParams)
+  throws IOException {
+    // Trains the build and check models, a POS tagger and a chunker on parseSamples and bundles them into a CHUNKING ParserModel.
+    System.err.println("Building dictionary");
+    // TODO: Discuss and make dict cutoff configurable, it is hard coded to 5 here
+    Dictionary mdict = buildDictionary(parseSamples, rules, 5); 
+    // The same sample stream feeds every training pass, so it is reset before each following pass.
+    parseSamples.reset();
+    
+    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+    // TODO: Fix this, find a way to include train params in manifest ...
+//    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cut, iterations);
+    
+    // build: uses the settings stored under the "build" name space and the dictionary built above
+    System.err.println("Training builder");
+    opennlp.model.EventStream bes = new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.BUILD, mdict);
+    HashSumEventStream hsbes = new HashSumEventStream(bes);
+    AbstractModel buildModel = TrainUtil.train(hsbes, mlParams.getSettings("build"));
+    manifestInfoEntries.put("Training-Builder-Eventhash", 
+        hsbes.calculateHashSum().toString(16)); // hash is taken after training and stored in base 16
+    
+    parseSamples.reset();
+    
+    // tag: the tagger gets the "tagger" name space only, no tag dictionary and no ngram dictionary
+    POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(parseSamples), 
+        mlParams.getParameters("tagger"), null, null); // <- pass on name space corrected TrainingParameters ...
+    
+    parseSamples.reset();
+    
+    // chunk: the chunker gets the "chunker" name space only
+    ChunkerModel chunkModel = ChunkerME.train(languageCode, 
+        new ChunkSampleStream(parseSamples),
+        new ChunkContextGenerator(), mlParams.getParameters("chunker")); // <- pass on name space corrected TrainingParameters ...
+    
+    parseSamples.reset();
+    
+    // check: uses the settings stored under the "check" name space; this event stream is created without the dictionary
+    System.err.println("Training checker");
+    opennlp.model.EventStream kes = new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.CHECK);
+    HashSumEventStream hskes = new HashSumEventStream(kes);
+    AbstractModel checkModel = TrainUtil.train(hskes, mlParams.getSettings("check"));
+    manifestInfoEntries.put("Training-Checker-Eventhash", 
+        hskes.calculateHashSum().toString(16)); // hash is taken after training and stored in base 16
+    // NOTE(review): the cast presumably fails for a HeadRules implementation other than lang.en.HeadRules - confirm
+    // TODO: Remove cast for HeadRules
+    return new ParserModel(languageCode, buildModel, checkModel,
+        posModel, chunkModel, (opennlp.tools.parser.lang.en.HeadRules) rules,
+        ParserType.CHUNKING, manifestInfoEntries);
+  }
+  
   public static ParserModel train(String languageCode, ObjectStream<Parse> parseSamples, HeadRules rules, int iterations, int cut)
       throws IOException {
     

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java Wed May 18 18:34:54 2011
@@ -25,6 +25,7 @@ import java.util.Set;
 
 import opennlp.model.AbstractModel;
 import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
 import opennlp.model.TwoPassDataIndexer;
 import opennlp.tools.chunker.Chunker;
 import opennlp.tools.chunker.ChunkerME;
@@ -46,6 +47,7 @@ import opennlp.tools.postag.POSTagger;
 import opennlp.tools.postag.POSTaggerME;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelType;
 
 /**
@@ -434,6 +436,57 @@ public class Parser extends AbstractBott
   }
 
   public static ParserModel train(String languageCode,
+      ObjectStream<Parse> parseSamples, HeadRules rules, TrainingParameters mlParams)
+  throws IOException {
+    // Trains the tagger, chunker, build, check and attach models on parseSamples and bundles them into a TREEINSERT ParserModel.
+    // TODO: training code should be shared between two parsers
+    System.err.println("Building dictionary");
+    // TODO: Make cutoff configurable ... (the dictionary cutoff is hard coded to 5 here)
+    Dictionary mdict = buildDictionary(parseSamples, rules, 5);
+    // The same sample stream feeds every training pass, so it is reset before each following pass.
+    parseSamples.reset();
+    
+    // tag: the tagger gets the "tagger" name space only, no tag dictionary and no ngram dictionary
+    POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(
+        parseSamples), mlParams.getParameters("tagger"), null, null);
+    
+    parseSamples.reset();
+    
+    // chunk: the chunker gets the "chunker" name space only
+    ChunkerModel chunkModel = ChunkerME.train(languageCode, new ChunkSampleStream(
+        parseSamples), new ChunkContextGenerator(), mlParams.getParameters("chunker"));
+    
+    parseSamples.reset();
+    
+    // build: uses the settings stored under the "build" name space and the dictionary built above
+    System.err.println("Training builder");
+    opennlp.model.EventStream bes = new ParserEventStream(parseSamples, rules,
+        ParserEventTypeEnum.BUILD, mdict);
+    AbstractModel buildModel = TrainUtil.train(bes, mlParams.getSettings("build"));
+    
+    parseSamples.reset();
+    
+    // check: uses the settings stored under the "check" name space; this event stream is created without the dictionary
+    System.err.println("Training checker");
+    opennlp.model.EventStream kes = new ParserEventStream(parseSamples, rules,
+        ParserEventTypeEnum.CHECK);
+    AbstractModel checkModel = TrainUtil.train(kes, mlParams.getSettings("check"));
+    
+    parseSamples.reset();
+    
+    // attach: uses the settings stored under the "attach" name space; this event stream is created without the dictionary
+    System.err.println("Training attacher");
+    opennlp.model.EventStream attachEvents = new ParserEventStream(parseSamples, rules,
+        ParserEventTypeEnum.ATTACH);
+    AbstractModel attachModel = TrainUtil.train(attachEvents, mlParams.getSettings("attach"));
+    // NOTE(review): the cast presumably fails for a HeadRules implementation other than lang.en.HeadRules - confirm
+    // TODO: Remove cast for HeadRules
+    return new ParserModel(languageCode, buildModel, checkModel,
+        attachModel, posModel, chunkModel, 
+        (opennlp.tools.parser.lang.en.HeadRules) rules, ParserType.TREEINSERT);
+  }
+  
+  public static ParserModel train(String languageCode,
       ObjectStream<Parse> parseSamples, HeadRules rules, int iterations, int cut)
       throws IOException {
     

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java Wed May 18 18:34:54 2011
@@ -28,6 +28,7 @@ import java.util.StringTokenizer;
 
 import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
+import opennlp.model.TrainUtil;
 import opennlp.model.TwoPassDataIndexer;
 import opennlp.perceptron.SimplePerceptronSequenceTrainer;
 import opennlp.tools.dictionary.Dictionary;
@@ -36,6 +37,7 @@ import opennlp.tools.util.HashSumEventSt
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.BaseModel;
 import opennlp.tools.util.model.ModelType;
 import opennlp.tools.util.model.ModelUtil;
@@ -310,51 +312,46 @@ public class POSTaggerME implements POST
     
   }
 
-  public static POSModel train(String languageCode, ObjectStream<POSSample> samples, ModelType modelType, POSDictionary tagDictionary,
-      Dictionary ngramDictionary, int cutoff, int iterations) throws IOException {
-
-    POSContextGenerator contextGenerator = new DefaultPOSContextGenerator(ngramDictionary);
+  public static POSModel train(String languageCode, ObjectStream<POSSample> samples, TrainingParameters trainParams, 
+      POSDictionary tagDictionary, Dictionary ngramDictionary) throws IOException {
     
-    AbstractModel posModel = null;
+    POSContextGenerator contextGenerator = new DefaultPOSContextGenerator(ngramDictionary);
     
     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+    // TODO: Store train params in model ... 
+//    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+    
+    AbstractModel posModel;
     
-    if (modelType.equals(ModelType.MAXENT) ||
-        modelType.equals(ModelType.PERCEPTRON)) {
+    if (!TrainUtil.isSequenceTraining(trainParams.getSettings())) {
+      
       EventStream es = new POSSampleEventStream(samples, contextGenerator);
       HashSumEventStream hses = new HashSumEventStream(es);
       
-      if (modelType.equals(ModelType.MAXENT)) {
-        posModel = opennlp.maxent.GIS.trainModel(iterations,
-            new TwoPassDataIndexer(hses, cutoff));
-      }
-      else if (modelType.equals(ModelType.PERCEPTRON)) {
-        boolean useAverage = true;
-
-        posModel = new opennlp.perceptron.PerceptronTrainer().trainModel(
-            iterations, new TwoPassDataIndexer(hses,
-            cutoff, false), cutoff, useAverage);
-      }
-      else {
-        throw new IllegalStateException();
-      }
+      posModel = TrainUtil.train(hses, trainParams.getSettings());
       
       manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
           hses.calculateHashSum().toString(16));
     }
-    else if (modelType.equals(ModelType.PERCEPTRON_SEQUENCE)) {
-      
-      POSSampleSequenceStream ss = new POSSampleSequenceStream(samples, contextGenerator);
-      boolean useAverage = true;
-      
-      posModel = new SimplePerceptronSequenceTrainer().trainModel(iterations, ss, cutoff,useAverage);
-    }
     else {
-      throw new IllegalStateException();
+      POSSampleSequenceStream ss = new POSSampleSequenceStream(samples, contextGenerator);
+
+      posModel = TrainUtil.train(ss, trainParams.getSettings());
     }
     
     return new POSModel(languageCode, posModel, tagDictionary,
         ngramDictionary, manifestInfoEntries);
   }
+  
+  public static POSModel train(String languageCode, ObjectStream<POSSample> samples, ModelType modelType, POSDictionary tagDictionary,
+      Dictionary ngramDictionary, int cutoff, int iterations) throws IOException {
+    // Legacy entry point: packs the explicit arguments into un-namespaced training parameters and delegates.
+    TrainingParameters legacyParams = new TrainingParameters();
+    final String algorithm = modelType.toString();
+    legacyParams.put(TrainingParameters.CUTOFF_PARAM, String.valueOf(cutoff));
+    legacyParams.put(TrainingParameters.ITERATIONS_PARAM, String.valueOf(iterations));
+    legacyParams.put(TrainingParameters.ALGORITHM_PARAM, algorithm);
+    
+    return train(languageCode, samples, legacyParams, tagDictionary, ngramDictionary);
+  }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java Wed May 18 18:34:54 2011
@@ -30,8 +30,10 @@ import java.util.Map;
 
 import opennlp.maxent.GIS;
 import opennlp.maxent.GISModel;
+import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
 import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.sentdetect.lang.Factory;
 import opennlp.tools.util.HashSumEventStream;
@@ -39,6 +41,7 @@ import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.BaseModel;
 import opennlp.tools.util.model.ModelUtil;
 
@@ -290,6 +293,29 @@ public class SentenceDetectorME implemen
     return new SentenceModel(languageCode, sentModel,
         useTokenEnd, abbreviations, manifestInfoEntries);
   }
+  
+  public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
+      boolean useTokenEnd, Dictionary abbreviations, TrainingParameters mlParams) throws IOException {
+    // Trains a sentence detector model with the un-namespaced settings of mlParams.
+    Factory languageFactory = new Factory();
+    
+    // TODO: Fix the EventStream to throw exceptions when training goes wrong
+    // The hash sum wrapper is kept so its hash can be stored in the manifest below.
+    HashSumEventStream hashedEvents = new HashSumEventStream(
+        new SDEventStream(samples,
+            languageFactory.createSentenceContextGenerator(languageCode),
+            languageFactory.createEndOfSentenceScanner(languageCode)));
+    
+    AbstractModel trainedModel = TrainUtil.train(hashedEvents, mlParams.getSettings());
+    
+    // TODO: cutoff and iterations are not added to the manifest by this overload.
+    Map<String, String> manifest = new HashMap<String, String>();
+    // Computed after TrainUtil.train has returned; stored in base 16.
+    String eventHash = hashedEvents.calculateHashSum().toString(16);
+    manifest.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, eventHash);
+    
+    return new SentenceModel(languageCode, trainedModel, useTokenEnd, abbreviations, manifest);
+  }
 
   private static void usage() {
     System.err.println("Usage: SentenceDetectorME -encoding charset -lang language trainData modelName [cutoff iterations]");

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java Wed May 18 18:34:54 2011
@@ -28,12 +28,15 @@ import java.util.regex.Pattern;
 
 import opennlp.maxent.GIS;
 import opennlp.maxent.GISModel;
+import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
 import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
 import opennlp.model.TwoPassDataIndexer;
 import opennlp.tools.util.HashSumEventStream;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.BaseModel;
 import opennlp.tools.util.model.ModelUtil;
 
@@ -189,6 +192,26 @@ public class TokenizerME extends Abstrac
     return spans;
   }
 
+  public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
+      boolean useAlphaNumericOptimization, TrainingParameters mlParams) throws IOException {
+    // Trains a tokenizer model with the un-namespaced settings of mlParams.
+    HashSumEventStream hashedEvents = new HashSumEventStream(
+        new TokSpanEventStream(samples, useAlphaNumericOptimization));
+
+    AbstractModel trainedModel = TrainUtil.train(hashedEvents, mlParams.getSettings());
+
+    // TODO: cutoff and iterations are not added to the manifest by this overload.
+    Map<String, String> manifest = new HashMap<String, String>();
+
+    // Computed after TrainUtil.train has returned; stored in base 16.
+    String eventHash = hashedEvents.calculateHashSum().toString(16);
+    manifest.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, eventHash);
+
+    // The optimization flag is passed on to the model together with the manifest.
+    return new TokenizerModel(languageCode, trainedModel,
+        useAlphaNumericOptimization, manifest);
+  }
+  
   /**
    * Trains a model for the {@link TokenizerME}.
    *

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java?rev=1124372&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java Wed May 18 18:34:54 2011
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+/**
+ * Holds the parameters of a training run as string key/value pairs; a key may carry a "namespace." prefix.
+ */
+public class TrainingParameters {
+  
+  public static final String ALGORITHM_PARAM = "Algorithm";
+  public static final String ITERATIONS_PARAM = "Iterations";
+  public static final String CUTOFF_PARAM = "Cutoff";
+  
+  private final Map<String, String> parameters = new HashMap<String, String>();
+  
+  /** Creates an empty parameter set. */
+  public TrainingParameters() {
+  }
+  
+  /**
+   * Reads the parameters from a stream in {@link Properties} format.
+   * @param in the stream to read the properties from
+   * @throws IOException if reading from the stream fails
+   */
+  public TrainingParameters(InputStream in) throws IOException {
+    Properties properties = new Properties();
+    properties.load(in);
+    for (Map.Entry<Object, Object> entry : properties.entrySet()) {
+      parameters.put((String) entry.getKey(), (String) entry.getValue());
+    }
+  }
+  
+  /**
+   * Retrieves the training algorithm name for a given name space.
+   * @param namespace the name space the algorithm is stored under
+   * @return the name or null if not set.
+   */
+  public String algorithm(String namespace) {
+    return parameters.get(namespace + "." + ALGORITHM_PARAM);
+  }
+  
+  /**
+   * Retrieves the training algorithm name.
+   * @return the name or null if not set.
+   */
+  public String algorithm() {
+    return parameters.get(ALGORITHM_PARAM);
+  }
+  
+  /**
+   * Retrieves a map with the training parameters which have the passed name space.
+   * The name space prefix is removed from the returned keys. If the name space is
+   * null, only the parameters whose key does not contain a dot are returned.
+   * @param namespace the name space to select, or null for the parameters without one
+   * @return an unmodifiable parameter map which can be passed to the train and validate methods.
+   */
+  public Map<String, String> getSettings(String namespace) {
+    Map<String, String> trainingParams = new HashMap<String, String>();
+    // The prefix is the same for every entry, so it is built once.
+    String prefix = namespace != null ? namespace + "." : null;
+    for (Map.Entry<String, String> entry : parameters.entrySet()) {
+      String key = entry.getKey();
+      if (prefix != null) {
+        if (key.startsWith(prefix))  {
+          // substring returns a new String, so its result is used as the key directly
+          trainingParams.put(key.substring(prefix.length()), entry.getValue());
+        }
+      }
+      else if (!key.contains(".")) {
+        trainingParams.put(key, entry.getValue());
+      }
+    }
+    
+    return Collections.unmodifiableMap(trainingParams);
+  }
+  
+  /** 
+   * Retrieves all parameters without a name space.
+   * @return an unmodifiable map of the parameters whose key does not contain a dot
+   */
+  public Map<String, String> getSettings() {
+    return getSettings(null);
+  }
+  
+  /**
+   * Reduces the parameters to the ones in the passed name space.
+   * @param namespace the name space to select, or null for the parameters without one
+   * @return a new instance with the selected parameters, stored without the prefix
+   */
+  public TrainingParameters getParameters(String namespace) {
+    TrainingParameters params = new TrainingParameters();
+    for (Map.Entry<String, String> entry : getSettings(namespace).entrySet()) {
+      params.put(entry.getKey(), entry.getValue());
+    }
+    return params;
+  }
+  
+  /** Stores the value under "namespace.key", or under the plain key if the name space is null. */
+  public void put(String namespace, String key, String value) {
+    if (namespace == null) {
+      parameters.put(key, value);
+    }
+    else {
+      parameters.put(namespace + "." + key, value);
+    }
+  }
+  
+  /** Stores the value under the plain key, without a name space. */
+  public void put(String key, String value) {
+    put(null, key, value);
+  }
+  
+  /** Writes all parameters to the stream in {@link Properties} format. */
+  public void serialize(OutputStream out) throws IOException {
+    Properties properties = new Properties();
+    for (Map.Entry<String, String> entry : parameters.entrySet()) {
+      properties.put(entry.getKey(), entry.getValue());
+    }
+    properties.store(out, null);
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain