You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/18 20:34:55 UTC
svn commit: r1124372 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools:
chunker/ cmdline/ cmdline/chunker/ cmdline/doccat/ cmdline/namefind/
cmdline/parser/ cmdline/postag/ cmdline/sentdetect/ cmdline/tokenizer/
doccat/ namefind/ ...
Author: joern
Date: Wed May 18 18:34:54 2011
New Revision: 1124372
URL: http://svn.apache.org/viewvc?rev=1124372&view=rev
Log:
OPENNLP-175 Updated cmd line interface and added train methods to train with training parameters file/object
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java (with props)
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java Wed May 18 18:34:54 2011
@@ -30,6 +30,7 @@ import java.util.Map;
import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.util.BeamSearch;
import opennlp.tools.util.HashSumEventStream;
@@ -38,6 +39,7 @@ import opennlp.tools.util.PlainTextByLin
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;
@@ -206,6 +208,24 @@ public class ChunkerME implements Chunke
}
public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in,
+ ChunkerContextGenerator contextGenerator, TrainingParameters mlParams)
+ throws IOException {
+
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+// ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+
+ EventStream es = new ChunkerEventStream(in, contextGenerator);
+ HashSumEventStream hses = new HashSumEventStream(es);
+
+ AbstractModel maxentModel = TrainUtil.train(hses, mlParams.getSettings());
+
+ manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
+ hses.calculateHashSum().toString(16));
+
+ return new ChunkerModel(lang, maxentModel, manifestInfoEntries);
+ }
+
+ public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in,
int cutoff, int iterations, ChunkerContextGenerator contextGenerator)
throws IOException {
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java Wed May 18 18:34:54 2011
@@ -23,6 +23,7 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
@@ -31,6 +32,8 @@ import java.util.Arrays;
import java.util.List;
import java.util.Locale;
+import opennlp.model.TrainUtil;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.BaseModel;
/**
@@ -331,4 +334,34 @@ public final class CmdLineUtil {
System.err.println("IO Error while reading from stdin: " + e.getMessage());
throw new TerminateToolException(-1);
}
+
+ // its optional, passing null is allowed
+ public static TrainingParameters loadTrainingParameters(String paramFile) {
+
+ TrainingParameters params = null;
+
+ if (paramFile != null) {
+
+ checkInputFile("Training Parameter", new File(paramFile));
+
+ InputStream paramsIn = null;
+ try {
+ paramsIn = new FileInputStream(new File(paramFile));
+
+ params = new opennlp.tools.util.TrainingParameters(paramsIn);
+ } catch (IOException e) {
+ // TODO: print error and exit
+ e.printStackTrace();
+ }
+ finally {
+ try {
+ if (paramsIn != null)
+ paramsIn.close();
+ } catch (IOException e) {
+ }
+ }
+ }
+
+ return params;
+ }
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,10 +22,12 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
+import opennlp.model.TrainUtil;
import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.chunker.ChunkSampleStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.chunker.DefaultChunkerContextGenerator;
import opennlp.tools.cmdline.BasicTrainingParameters;
import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
@@ -76,6 +78,21 @@ public class ChunkerTrainerTool implemen
throw new TerminateToolException(1);
}
+ opennlp.tools.util.TrainingParameters mlParams =
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+
+ if (mlParams != null) {
+ if (!TrainUtil.isValid(mlParams.getSettings())) {
+ System.err.println("Training parameters file is invalid!");
+ throw new TerminateToolException(-1);
+ }
+
+ if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+ System.err.println("Sequence training is not supported!");
+ throw new TerminateToolException(-1);
+ }
+ }
+
File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
@@ -85,8 +102,14 @@ public class ChunkerTrainerTool implemen
ChunkerModel model;
try {
- model = ChunkerME.train(parameters.getLanguage(), sampleStream,
- parameters.getCutoff(), parameters.getNumberOfIterations());
+ if (mlParams == null) {
+ model = ChunkerME.train(parameters.getLanguage(), sampleStream,
+ parameters.getCutoff(), parameters.getNumberOfIterations());
+ }
+ else {
+ model = ChunkerME.train(parameters.getLanguage(), sampleStream,
+ new DefaultChunkerContextGenerator(), mlParams);
+ }
} catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
throw new TerminateToolException(-1);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
+import opennlp.model.TrainUtil;
import opennlp.tools.cmdline.BasicTrainingParameters;
import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
@@ -75,6 +76,21 @@ public class DoccatTrainerTool implement
throw new TerminateToolException(1);
}
+ opennlp.tools.util.TrainingParameters mlParams =
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+
+ if (mlParams != null) {
+ if (!TrainUtil.isValid(mlParams.getSettings())) {
+ System.err.println("Training parameters file is invalid!");
+ throw new TerminateToolException(-1);
+ }
+
+ if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+ System.err.println("Sequence training is not supported!");
+ throw new TerminateToolException(-1);
+ }
+ }
+
File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
@@ -84,8 +100,14 @@ public class DoccatTrainerTool implement
DoccatModel model;
try {
- model = DocumentCategorizerME.train(parameters.getLanguage(), sampleStream,
+ if (mlParams == null) {
+ model = DocumentCategorizerME.train(parameters.getLanguage(), sampleStream,
parameters.getCutoff(), parameters.getNumberOfIterations());
+ }
+ else {
+ model = DocumentCategorizerME.train(parameters.getLanguage(), sampleStream,
+ mlParams);
+ }
} catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
throw new TerminateToolException(-1);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Wed May 18 18:34:54 2011
@@ -23,6 +23,7 @@ import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Collections;
+import opennlp.model.TrainUtil;
import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
@@ -75,18 +76,39 @@ public final class TokenNameFinderTraine
throw new TerminateToolException(1);
}
+ opennlp.tools.util.TrainingParameters mlParams =
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+
+ if (mlParams != null) {
+ if (!TrainUtil.isValid(mlParams.getSettings())) {
+ System.err.println("Training parameters file is invalid!");
+ throw new TerminateToolException(-1);
+ }
+
+ if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+ System.err.println("Sequence training is not supported!");
+ throw new TerminateToolException(-1);
+ }
+ }
+
File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
-
+
CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
ObjectStream<NameSample> sampleStream = openSampleData("Training", trainingDataInFile,
parameters.getEncoding());
TokenNameFinderModel model;
try {
+ if (mlParams == null) {
model = opennlp.tools.namefind.NameFinderME.train(parameters.getLanguage(), parameters.getType(),
sampleStream, Collections.<String, Object>emptyMap(),
parameters.getNumberOfIterations(), parameters.getCutoff());
+ }
+ else {
+ model = opennlp.tools.namefind.NameFinderME.train(parameters.getLanguage(), parameters.getType(), sampleStream, mlParams, null,
+ Collections.<String, Object>emptyMap());
+ }
}
catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTrainerTool.java Wed May 18 18:34:54 2011
@@ -25,6 +25,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
+import opennlp.model.TrainUtil;
import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
@@ -107,6 +108,21 @@ public final class ParserTrainerTool imp
throw new TerminateToolException(1);
}
+ opennlp.tools.util.TrainingParameters mlParams =
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+
+ if (mlParams != null) {
+ if (!TrainUtil.isValid(mlParams.getSettings())) {
+ System.err.println("Training parameters file is invalid!");
+ throw new TerminateToolException(-1);
+ }
+
+ if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+ System.err.println("Sequence training is not supported!");
+ throw new TerminateToolException(-1);
+ }
+ }
+
ObjectStream<Parse> sampleStream = openTrainingData(new File(CmdLineUtil.getParameter("-data", args)), parameters.getEncoding());
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
@@ -119,19 +135,35 @@ public final class ParserTrainerTool imp
new InputStreamReader(new FileInputStream(new File(CmdLineUtil.getParameter("-head-rules", args))),
parameters.getEncoding()));
- if (ParserType.CHUNKING.equals(parameters.getParserType())) {
- model = opennlp.tools.parser.chunking.Parser.train(
- parameters.getLanguage(), sampleStream, rules,
- parameters.getNumberOfIterations(), parameters.getCutoff());
- }
- else if (ParserType.TREEINSERT.equals(parameters.getParserType())) {
- model = opennlp.tools.parser.treeinsert.Parser.train(parameters.getLanguage(), sampleStream, rules, parameters.getNumberOfIterations(),
- parameters.getCutoff());
+ if (mlParams == null) {
+ if (ParserType.CHUNKING.equals(parameters.getParserType())) {
+ model = opennlp.tools.parser.chunking.Parser.train(
+ parameters.getLanguage(), sampleStream, rules,
+ parameters.getNumberOfIterations(), parameters.getCutoff());
+ }
+ else if (ParserType.TREEINSERT.equals(parameters.getParserType())) {
+ model = opennlp.tools.parser.treeinsert.Parser.train(parameters.getLanguage(), sampleStream, rules, parameters.getNumberOfIterations(),
+ parameters.getCutoff());
+ }
+ else {
+ throw new IllegalStateException();
+ }
}
else {
- throw new IllegalStateException();
+ if (ParserType.CHUNKING.equals(parameters.getParserType())) {
+ model = opennlp.tools.parser.chunking.Parser.train(
+ parameters.getLanguage(), sampleStream, rules,
+ mlParams);
+ }
+ else if (ParserType.TREEINSERT.equals(parameters.getParserType())) {
+ model = opennlp.tools.parser.treeinsert.Parser.train(parameters.getLanguage(), sampleStream, rules,
+ mlParams);
+ }
+ else {
+ throw new IllegalStateException();
+ }
+
}
-
}
catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
+import opennlp.model.TrainUtil;
import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
@@ -74,6 +75,14 @@ public final class POSTaggerTrainerTool
throw new TerminateToolException(1);
}
+ opennlp.tools.util.TrainingParameters mlParams =
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+
+ if (mlParams != null && !TrainUtil.isValid(mlParams.getSettings())) {
+ System.err.println("Training parameters file is invalid!");
+ throw new TerminateToolException(-1);
+ }
+
File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
@@ -90,9 +99,15 @@ public final class POSTaggerTrainerTool
tagdict = new POSDictionary(parameters.getDictionaryPath());
}
- // depending on model and sequence choose training method
- model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
- sampleStream, parameters.getModel(), tagdict, null, parameters.getCutoff(), parameters.getNumberOfIterations());
+ if (mlParams == null) {
+ // depending on model and sequence choose training method
+ model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
+ sampleStream, parameters.getModel(), tagdict, null, parameters.getCutoff(), parameters.getNumberOfIterations());
+ }
+ else {
+ model = opennlp.tools.postag.POSTaggerME.train(parameters.getLanguage(),
+ sampleStream, mlParams, tagdict, null);
+ }
}
catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
+import opennlp.model.TrainUtil;
import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
@@ -73,6 +74,21 @@ public final class SentenceDetectorTrain
System.out.println(getHelp());
throw new TerminateToolException(1);
}
+
+ opennlp.tools.util.TrainingParameters mlParams =
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+
+ if (mlParams != null) {
+ if (!TrainUtil.isValid(mlParams.getSettings())) {
+ System.err.println("Training parameters file is invalid!");
+ throw new TerminateToolException(-1);
+ }
+
+ if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+ System.err.println("Sequence training is not supported!");
+ throw new TerminateToolException(-1);
+ }
+ }
File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
@@ -80,11 +96,17 @@ public final class SentenceDetectorTrain
CmdLineUtil.checkOutputFile("sentence detector model", modelOutFile);
ObjectStream<SentenceSample> sampleStream =
openSampleData("Training", trainingDataInFile, parameters.getEncoding());
-
+
SentenceModel model;
try {
- model = SentenceDetectorME.train(parameters.getLanguage(), sampleStream, true, null,
- parameters.getCutoff(), parameters.getNumberOfIterations());
+ if (mlParams == null) {
+ model = SentenceDetectorME.train(parameters.getLanguage(), sampleStream, true, null,
+ parameters.getCutoff(), parameters.getNumberOfIterations());
+ }
+ else {
+ model = SentenceDetectorME.train(parameters.getLanguage(), sampleStream, true, null,
+ mlParams);
+ }
} catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
throw new TerminateToolException(-1);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java Wed May 18 18:34:54 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
+import opennlp.model.TrainUtil;
import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
@@ -73,19 +74,42 @@ public final class TokenizerTrainerTool
throw new TerminateToolException(1);
}
+ opennlp.tools.util.TrainingParameters mlParams =
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args));
+
+ if (mlParams != null) {
+ if (!TrainUtil.isValid(mlParams.getSettings())) {
+ System.err.println("Training parameters file is invalid!");
+ throw new TerminateToolException(-1);
+ }
+
+ if (TrainUtil.isSequenceTraining(mlParams.getSettings())) {
+ System.err.println("Sequence training is not supported!");
+ throw new TerminateToolException(-1);
+ }
+ }
+
File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
-
+
CmdLineUtil.checkOutputFile("tokenizer model", modelOutFile);
ObjectStream<TokenSample> sampleStream = openSampleData("Training",
trainingDataInFile, parameters.getEncoding());
TokenizerModel model;
try {
- model = opennlp.tools.tokenize.TokenizerME.train(
- parameters.getLanguage(), sampleStream,
- parameters.isAlphaNumericOptimizationEnabled(),
- parameters.getCutoff(), parameters.getNumberOfIterations());
+ if (mlParams == null) {
+ model = opennlp.tools.tokenize.TokenizerME.train(
+ parameters.getLanguage(), sampleStream,
+ parameters.isAlphaNumericOptimizationEnabled(),
+ parameters.getCutoff(), parameters.getNumberOfIterations());
+ }
+ else {
+ model = opennlp.tools.tokenize.TokenizerME.train(
+ parameters.getLanguage(), sampleStream,
+ parameters.isAlphaNumericOptimizationEnabled(),
+ mlParams);
+ }
} catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
throw new TerminateToolException(-1);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java Wed May 18 18:34:54 2011
@@ -26,10 +26,12 @@ import java.util.Map;
import opennlp.maxent.GIS;
import opennlp.model.AbstractModel;
import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
/**
@@ -142,6 +144,21 @@ public class DocumentCategorizerME imple
return GIS.trainModel(100, new TwoPassDataIndexer(eventStream, 5));
}
+
+ public static DoccatModel train(String languageCode, ObjectStream<DocumentSample> samples,
+ TrainingParameters mlParams, FeatureGenerator... featureGenerators)
+ throws IOException {
+
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+// ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+
+ AbstractModel model = TrainUtil.train(
+ new DocumentCategorizerEventStream(samples, featureGenerators),
+ mlParams.getSettings());
+
+ return new DoccatModel(languageCode, model, manifestInfoEntries);
+ }
+
/**
* Trains a document categorizer model with custom feature generation.
*
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java Wed May 18 18:34:54 2011
@@ -37,6 +37,7 @@ import opennlp.maxent.GISModel;
import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.util.BeamSearch;
import opennlp.tools.util.HashSumEventStream;
@@ -45,6 +46,7 @@ import opennlp.tools.util.PlainTextByLin
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
import opennlp.tools.util.featuregen.AdditionalContextFeatureGenerator;
import opennlp.tools.util.featuregen.CachedFeatureGenerator;
@@ -311,7 +313,38 @@ public class NameFinderME implements Tok
return sprobs;
}
-
+ public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples,
+ TrainingParameters trainParams, AdaptiveFeatureGenerator generator, final Map<String, Object> resources) throws IOException {
+
+ if (TrainUtil.isSequenceTraining(trainParams.getSettings())) {
+ throw new IllegalArgumentException("Sequence training is not supported!");
+ }
+
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+// ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+
+ AdaptiveFeatureGenerator featureGenerator;
+
+ if (generator != null)
+ featureGenerator = generator;
+ else
+ featureGenerator = createFeatureGenerator();
+
+ EventStream eventStream = new NameFinderEventStream(samples, type,
+ new DefaultNameContextGenerator(featureGenerator));
+ HashSumEventStream hses = new HashSumEventStream(eventStream);
+
+ AbstractModel nameFinderModel = TrainUtil.train(hses, trainParams.getSettings());
+
+// AbstractModel nameFinderModel = GIS.trainModel(iterations, new TwoPassDataIndexer(hses, cutoff));
+
+ manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
+ hses.calculateHashSum().toString(16));
+
+ return new TokenNameFinderModel(languageCode, nameFinderModel,
+ resources, manifestInfoEntries);
+ }
+
/**
* Trains a name finder model.
*
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java Wed May 18 18:34:54 2011
@@ -30,6 +30,7 @@ import java.util.Map;
import opennlp.model.AbstractModel;
import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.chunker.ChunkSample;
import opennlp.tools.chunker.Chunker;
@@ -56,6 +57,7 @@ import opennlp.tools.util.InvalidFormatE
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelType;
import opennlp.tools.util.model.ModelUtil;
@@ -275,6 +277,56 @@ public class Parser extends AbstractBott
return opennlp.maxent.GIS.trainModel(iterations, new TwoPassDataIndexer(es, cut));
}
+ public static ParserModel train(String languageCode, ObjectStream<Parse> parseSamples, HeadRules rules, TrainingParameters mlParams)
+ throws IOException {
+
+ System.err.println("Building dictionary");
+ // TODO: Discuss and make dict cutoff configurable
+ Dictionary mdict = buildDictionary(parseSamples, rules, 5);
+
+ parseSamples.reset();
+
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+ // TODO: Fix this, find a way to include train params in manifest ...
+// ModelUtil.addCutoffAndIterations(manifestInfoEntries, cut, iterations);
+
+ // build
+ System.err.println("Training builder");
+ opennlp.model.EventStream bes = new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.BUILD, mdict);
+ HashSumEventStream hsbes = new HashSumEventStream(bes);
+ AbstractModel buildModel = TrainUtil.train(hsbes, mlParams.getSettings("build"));
+ manifestInfoEntries.put("Training-Builder-Eventhash",
+ hsbes.calculateHashSum().toString(16));
+
+ parseSamples.reset();
+
+ // tag
+ POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(parseSamples),
+ mlParams.getParameters("tagger"), null, null); // <- pass on name space corrected TrainingParameters ...
+
+ parseSamples.reset();
+
+ // chunk
+ ChunkerModel chunkModel = ChunkerME.train(languageCode,
+ new ChunkSampleStream(parseSamples), // <- pass on name space corrected TrainingParameters ...
+ new ChunkContextGenerator(), mlParams.getParameters("chunker"));
+
+ parseSamples.reset();
+
+ // check
+ System.err.println("Training checker");
+ opennlp.model.EventStream kes = new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.CHECK);
+ HashSumEventStream hskes = new HashSumEventStream(kes);
+ AbstractModel checkModel = TrainUtil.train(hskes, mlParams.getSettings("check"));
+ manifestInfoEntries.put("Training-Checker-Eventhash",
+ hskes.calculateHashSum().toString(16));
+
+ // TODO: Remove cast for HeadRules
+ return new ParserModel(languageCode, buildModel, checkModel,
+ posModel, chunkModel, (opennlp.tools.parser.lang.en.HeadRules) rules,
+ ParserType.CHUNKING, manifestInfoEntries);
+ }
+
public static ParserModel train(String languageCode, ObjectStream<Parse> parseSamples, HeadRules rules, int iterations, int cut)
throws IOException {
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java Wed May 18 18:34:54 2011
@@ -25,6 +25,7 @@ import java.util.Set;
import opennlp.model.AbstractModel;
import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
@@ -46,6 +47,7 @@ import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelType;
/**
@@ -434,6 +436,57 @@ public class Parser extends AbstractBott
}
+  /**
+   * Trains a tree insert parser model, reading the machine learning settings
+   * from a {@link TrainingParameters} object.
+   * <p>
+   * The part-of-speech tagger and the chunker are trained with the parameters
+   * of the "tagger" and "chunker" name spaces; the build, check and attach
+   * models are trained with the settings of the "build", "check" and "attach"
+   * name spaces. The sample stream is handed to every training step and reset
+   * in between.
+   *
+   * @param languageCode the language code handed to the trained models
+   * @param parseSamples the training parses; the stream has to support reset
+   * @param rules the head rules; they are cast to
+   *     opennlp.tools.parser.lang.en.HeadRules when the model is created
+   * @param mlParams the training parameters of all sub-models
+   *
+   * @return the trained model of type {@link ParserType#TREEINSERT}
+   *
+   * @throws IOException declared for the stream and training calls
+   *     (NOTE(review): the exact failure conditions are not visible here)
+   */
public static ParserModel train(String languageCode,
+      ObjectStream<Parse> parseSamples, HeadRules rules, TrainingParameters mlParams)
+      throws IOException {
+
+    // TODO: training code should be shared between two parsers
+    // TODO: Make cutoff configurable ...
+    final int dictionaryCutoff = 5;
+
+    System.err.println("Building dictionary");
+    Dictionary dictionary = buildDictionary(parseSamples, rules, dictionaryCutoff);
+    parseSamples.reset();
+
+    // tagger, receives the parameters of the "tagger" name space
+    POSModel posModel = POSTaggerME.train(languageCode,
+        new PosSampleStream(parseSamples), mlParams.getParameters("tagger"), null, null);
+    parseSamples.reset();
+
+    // chunker, receives the parameters of the "chunker" name space
+    ChunkerModel chunkModel = ChunkerME.train(languageCode,
+        new ChunkSampleStream(parseSamples), new ChunkContextGenerator(),
+        mlParams.getParameters("chunker"));
+    parseSamples.reset();
+
+    // build model, the only step which uses the dictionary
+    System.err.println("Training builder");
+    opennlp.model.EventStream buildEvents =
+        new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.BUILD, dictionary);
+    AbstractModel buildModel = TrainUtil.train(buildEvents, mlParams.getSettings("build"));
+    parseSamples.reset();
+
+    // check model
+    System.err.println("Training checker");
+    opennlp.model.EventStream checkEvents =
+        new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.CHECK);
+    AbstractModel checkModel = TrainUtil.train(checkEvents, mlParams.getSettings("check"));
+    parseSamples.reset();
+
+    // attach model
+    System.err.println("Training attacher");
+    opennlp.model.EventStream attachEvents =
+        new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.ATTACH);
+    AbstractModel attachModel = TrainUtil.train(attachEvents, mlParams.getSettings("attach"));
+
+    // TODO: Remove cast for HeadRules
+    opennlp.tools.parser.lang.en.HeadRules englishRules =
+        (opennlp.tools.parser.lang.en.HeadRules) rules;
+
+    return new ParserModel(languageCode, buildModel, checkModel, attachModel,
+        posModel, chunkModel, englishRules, ParserType.TREEINSERT);
+  }
+
+ public static ParserModel train(String languageCode,
ObjectStream<Parse> parseSamples, HeadRules rules, int iterations, int cut)
throws IOException {
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java Wed May 18 18:34:54 2011
@@ -28,6 +28,7 @@ import java.util.StringTokenizer;
import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
+import opennlp.model.TrainUtil;
import opennlp.model.TwoPassDataIndexer;
import opennlp.perceptron.SimplePerceptronSequenceTrainer;
import opennlp.tools.dictionary.Dictionary;
@@ -36,6 +37,7 @@ import opennlp.tools.util.HashSumEventSt
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelType;
import opennlp.tools.util.model.ModelUtil;
@@ -310,51 +312,46 @@ public class POSTaggerME implements POST
}
- public static POSModel train(String languageCode, ObjectStream<POSSample> samples, ModelType modelType, POSDictionary tagDictionary,
- Dictionary ngramDictionary, int cutoff, int iterations) throws IOException {
-
- POSContextGenerator contextGenerator = new DefaultPOSContextGenerator(ngramDictionary);
+ public static POSModel train(String languageCode, ObjectStream<POSSample> samples, TrainingParameters trainParams,
+ POSDictionary tagDictionary, Dictionary ngramDictionary) throws IOException {
- AbstractModel posModel = null;
+ POSContextGenerator contextGenerator = new DefaultPOSContextGenerator(ngramDictionary);
Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+ // TODO: Store train params in model ...
+// ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+
+ AbstractModel posModel;
- if (modelType.equals(ModelType.MAXENT) ||
- modelType.equals(ModelType.PERCEPTRON)) {
+ if (!TrainUtil.isSequenceTraining(trainParams.getSettings())) {
+
EventStream es = new POSSampleEventStream(samples, contextGenerator);
HashSumEventStream hses = new HashSumEventStream(es);
- if (modelType.equals(ModelType.MAXENT)) {
- posModel = opennlp.maxent.GIS.trainModel(iterations,
- new TwoPassDataIndexer(hses, cutoff));
- }
- else if (modelType.equals(ModelType.PERCEPTRON)) {
- boolean useAverage = true;
-
- posModel = new opennlp.perceptron.PerceptronTrainer().trainModel(
- iterations, new TwoPassDataIndexer(hses,
- cutoff, false), cutoff, useAverage);
- }
- else {
- throw new IllegalStateException();
- }
+ posModel = TrainUtil.train(hses, trainParams.getSettings());
manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
hses.calculateHashSum().toString(16));
}
- else if (modelType.equals(ModelType.PERCEPTRON_SEQUENCE)) {
-
- POSSampleSequenceStream ss = new POSSampleSequenceStream(samples, contextGenerator);
- boolean useAverage = true;
-
- posModel = new SimplePerceptronSequenceTrainer().trainModel(iterations, ss, cutoff,useAverage);
- }
else {
- throw new IllegalStateException();
+ POSSampleSequenceStream ss = new POSSampleSequenceStream(samples, contextGenerator);
+
+ posModel = TrainUtil.train(ss, trainParams.getSettings());
}
return new POSModel(languageCode, posModel, tagDictionary,
ngramDictionary, manifestInfoEntries);
}
+
+  /**
+   * Trains a part-of-speech tagger model from an explicit model type, cutoff
+   * and number of iterations.
+   * <p>
+   * The three values are stored in a new {@link TrainingParameters} object,
+   * without a name space, and the training itself is delegated to the train
+   * method which accepts such an object.
+   *
+   * @param languageCode the language code handed to the delegate
+   * @param samples the training samples
+   * @param modelType its string form is stored as the algorithm parameter
+   * @param tagDictionary handed to the delegate unchanged
+   * @param ngramDictionary handed to the delegate unchanged
+   * @param cutoff stored as the cutoff parameter
+   * @param iterations stored as the iterations parameter
+   *
+   * @return the model returned by the delegate
+   *
+   * @throws IOException if the delegate throws it
+   */
+  public static POSModel train(String languageCode, ObjectStream<POSSample> samples,
+      ModelType modelType, POSDictionary tagDictionary, Dictionary ngramDictionary,
+      int cutoff, int iterations) throws IOException {
+
+    String algorithmName = modelType.toString();
+
+    TrainingParameters legacyParams = new TrainingParameters();
+    legacyParams.put(TrainingParameters.ALGORITHM_PARAM, algorithmName);
+    legacyParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+    legacyParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+
+    return train(languageCode, samples, legacyParams, tagDictionary, ngramDictionary);
+  }
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java Wed May 18 18:34:54 2011
@@ -30,8 +30,10 @@ import java.util.Map;
import opennlp.maxent.GIS;
import opennlp.maxent.GISModel;
+import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.sentdetect.lang.Factory;
import opennlp.tools.util.HashSumEventStream;
@@ -39,6 +41,7 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;
@@ -290,6 +293,29 @@ public class SentenceDetectorME implemen
return new SentenceModel(languageCode, sentModel,
useTokenEnd, abbreviations, manifestInfoEntries);
}
+
+  /**
+   * Trains a sentence detector model, reading the machine learning settings
+   * from a {@link TrainingParameters} object.
+   * <p>
+   * The settings without a name space are used for the training. The hash sum
+   * of the training events is stored in the manifest of the returned model;
+   * cutoff and iterations are not added to the manifest by this method.
+   *
+   * @param languageCode the language code used to create the context generator
+   *     and the end of sentence scanner, also stored in the model
+   * @param samples the training samples
+   * @param useTokenEnd handed to the model unchanged
+   * @param abbreviations handed to the model unchanged
+   * @param mlParams the training parameters
+   *
+   * @return the trained sentence model
+   *
+   * @throws IOException declared for the training calls
+   *     (NOTE(review): the exact failure conditions are not visible here)
+   */
+  public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
+      boolean useTokenEnd, Dictionary abbreviations, TrainingParameters mlParams)
+      throws IOException {
+
+    Factory langFactory = new Factory();
+
+    // TODO: Fix the EventStream to throw exceptions when training goes wrong
+    EventStream sentenceEvents = new SDEventStream(samples,
+        langFactory.createSentenceContextGenerator(languageCode),
+        langFactory.createEndOfSentenceScanner(languageCode));
+    HashSumEventStream hashedEvents = new HashSumEventStream(sentenceEvents);
+
+    AbstractModel sentModel = TrainUtil.train(hashedEvents, mlParams.getSettings());
+
+    // the hash sum is requested only after the training call has returned
+    Map<String, String> manifest = new HashMap<String, String>();
+    manifest.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
+        hashedEvents.calculateHashSum().toString(16));
+
+    return new SentenceModel(languageCode, sentModel, useTokenEnd, abbreviations, manifest);
+  }
private static void usage() {
System.err.println("Usage: SentenceDetectorME -encoding charset -lang language trainData modelName [cutoff iterations]");
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java?rev=1124372&r1=1124371&r2=1124372&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java Wed May 18 18:34:54 2011
@@ -28,12 +28,15 @@ import java.util.regex.Pattern;
import opennlp.maxent.GIS;
import opennlp.maxent.GISModel;
+import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
+import opennlp.model.TrainUtil;
import opennlp.model.TwoPassDataIndexer;
import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;
@@ -189,6 +192,26 @@ public class TokenizerME extends Abstrac
return spans;
}
+  /**
+   * Trains a tokenizer model, reading the machine learning settings from a
+   * {@link TrainingParameters} object.
+   * <p>
+   * The settings without a name space are used for the training. The hash sum
+   * of the training events is stored in the manifest of the returned model;
+   * cutoff and iterations are not added to the manifest by this method.
+   *
+   * @param languageCode the language code stored in the model
+   * @param samples the training samples
+   * @param useAlphaNumericOptimization handed to the event stream and to the model
+   * @param mlParams the training parameters
+   *
+   * @return the trained tokenizer model
+   *
+   * @throws IOException declared for the training calls
+   *     (NOTE(review): the exact failure conditions are not visible here)
+   */
+  public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
+      boolean useAlphaNumericOptimization, TrainingParameters mlParams) throws IOException {
+
+    EventStream tokenEvents = new TokSpanEventStream(samples, useAlphaNumericOptimization);
+    HashSumEventStream hashedEvents = new HashSumEventStream(tokenEvents);
+
+    AbstractModel maxentModel = TrainUtil.train(hashedEvents, mlParams.getSettings());
+
+    // the hash sum is requested only after the training call has returned
+    Map<String, String> manifest = new HashMap<String, String>();
+    manifest.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
+        hashedEvents.calculateHashSum().toString(16));
+
+    return new TokenizerModel(languageCode, maxentModel,
+        useAlphaNumericOptimization, manifest);
+  }
+
/**
* Trains a model for the {@link TokenizerME}.
*
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java?rev=1124372&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java Wed May 18 18:34:54 2011
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+/**
+ * Holds training parameters as string key/value pairs which can optionally
+ * be grouped into name spaces.
+ * <p>
+ * A parameter which belongs to a name space is stored under the key
+ * <code>namespace.key</code>, a parameter without a name space under its
+ * plain key. A <code>null</code> name space always means "no name space".
+ */
+public class TrainingParameters {
+
+  public static final String ALGORITHM_PARAM = "Algorithm";
+
+  public static final String ITERATIONS_PARAM = "Iterations";
+  public static final String CUTOFF_PARAM = "Cutoff";
+
+  private final Map<String, String> parameters = new HashMap<String, String>();
+
+  /**
+   * Creates an empty parameter set.
+   */
+  public TrainingParameters() {
+  }
+
+  /**
+   * Creates a parameter set from a stream in the {@link Properties} format.
+   * The stream is not closed by this constructor.
+   *
+   * @param in the stream to read the properties from
+   *
+   * @throws IOException if reading from the stream fails
+   */
+  public TrainingParameters(InputStream in) throws IOException {
+
+    Properties properties = new Properties();
+    properties.load(in);
+
+    for (Map.Entry<Object, Object> entry : properties.entrySet()) {
+      parameters.put((String) entry.getKey(), (String) entry.getValue());
+    }
+  }
+
+  /**
+   * Retrieves the training algorithm name for a given name space.
+   *
+   * @param namespace the name space, or null to retrieve the algorithm name
+   *     which was set without a name space
+   *
+   * @return the name or null if not set.
+   */
+  public String algorithm(String namespace) {
+
+    // null means "no name space", as in put and getSettings
+    if (namespace == null) {
+      return algorithm();
+    }
+
+    return parameters.get(namespace + "." + ALGORITHM_PARAM);
+  }
+
+  /**
+   * Retrieves the training algorithm name.
+   *
+   * @return the name or null if not set.
+   */
+  public String algorithm() {
+    return parameters.get(ALGORITHM_PARAM);
+  }
+
+  /**
+   * Retrieves a map with the training parameters which have the passed name space.
+   * The name space prefix is removed from the keys of the returned map.
+   *
+   * @param namespace the name space, or null to retrieve the parameters whose
+   *     key does not contain a dot
+   *
+   * @return an unmodifiable parameter map which can be passed to the train and validate methods.
+   */
+  public Map<String, String> getSettings(String namespace) {
+
+    // the prefix is the same for every entry, build it once
+    String prefix = namespace != null ? namespace + "." : null;
+
+    Map<String, String> trainingParams = new HashMap<String, String>();
+
+    for (Map.Entry<String, String> entry : parameters.entrySet()) {
+      String key = entry.getKey();
+
+      if (prefix != null) {
+        if (key.startsWith(prefix)) {
+          trainingParams.put(key.substring(prefix.length()), entry.getValue());
+        }
+      }
+      else if (!key.contains(".")) {
+        trainingParams.put(key, entry.getValue());
+      }
+    }
+
+    return Collections.unmodifiableMap(trainingParams);
+  }
+
+  /**
+   * Retrieves all parameters without a name space.
+   *
+   * @return an unmodifiable map of the parameters whose key does not contain a dot
+   */
+  public Map<String, String> getSettings() {
+    return getSettings(null);
+  }
+
+  /**
+   * Reduces the parameters to the ones in the passed name space. In the
+   * returned object they are stored with the name space prefix removed.
+   *
+   * @param namespace the name space, or null for the parameters without a name space
+   *
+   * @return a new, independent parameter object
+   */
+  public TrainingParameters getParameters(String namespace) {
+
+    TrainingParameters params = new TrainingParameters();
+
+    for (Map.Entry<String, String> entry : getSettings(namespace).entrySet()) {
+      params.put(entry.getKey(), entry.getValue());
+    }
+
+    return params;
+  }
+
+  /**
+   * Stores a parameter in a name space, replacing a previous value.
+   *
+   * @param namespace the name space, or null to store the parameter without a name space
+   * @param key the parameter name
+   * @param value the parameter value
+   */
+  public void put(String namespace, String key, String value) {
+
+    if (namespace == null) {
+      parameters.put(key, value);
+    }
+    else {
+      parameters.put(namespace + "." + key, value);
+    }
+  }
+
+  /**
+   * Stores a parameter without a name space, replacing a previous value.
+   *
+   * @param key the parameter name
+   * @param value the parameter value
+   */
+  public void put(String key, String value) {
+    put(null, key, value);
+  }
+
+  /**
+   * Writes all parameters to the stream in the {@link Properties} format.
+   * The stream is not closed by this method.
+   *
+   * @param out the stream to write to
+   *
+   * @throws IOException if writing to the stream fails
+   */
+  public void serialize(OutputStream out) throws IOException {
+    Properties properties = new Properties();
+
+    for (Map.Entry<String, String> entry : parameters.entrySet()) {
+      properties.setProperty(entry.getKey(), entry.getValue());
+    }
+
+    properties.store(out, null);
+  }
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
------------------------------------------------------------------------------
svn:mime-type = text/plain