You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/19 16:56:45 UTC
svn commit: r1124880 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: chunker/ doccat/ namefind/ postag/ sentdetect/ tokenize/
Author: joern
Date: Thu May 19 14:56:45 2011
New Revision: 1124880
URL: http://svn.apache.org/viewvc?rev=1124880&view=rev
Log:
OPENNLP-175 Removed duplicate code
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java Thu May 19 14:56:45 2011
@@ -219,19 +219,12 @@ public class ChunkerME implements Chunke
int cutoff, int iterations, ChunkerContextGenerator contextGenerator)
throws IOException {
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
- EventStream es = new ChunkerEventStream(in, contextGenerator);
- HashSumEventStream hses = new HashSumEventStream(es);
-
- AbstractModel maxentModel = opennlp.maxent.GIS.trainModel(iterations,
- new TwoPassDataIndexer(hses, cutoff));
-
- manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
- hses.calculateHashSum().toString(16));
-
- return new ChunkerModel(lang, maxentModel, manifestInfoEntries);
+ return train(lang, in, contextGenerator, mlParams);
}
/**
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java Thu May 19 14:56:45 2011
@@ -172,13 +172,12 @@ public class DocumentCategorizerME imple
public static DoccatModel train(String languageCode, ObjectStream<DocumentSample> samples, int cutoff, int iterations, FeatureGenerator... featureGenerators)
throws IOException {
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
- AbstractModel model = GIS.trainModel(iterations, new TwoPassDataIndexer(
- new DocumentCategorizerEventStream(samples, featureGenerators), cutoff));
-
- return new DoccatModel(languageCode, model, manifestInfoEntries);
+ return train(languageCode, samples, mlParams);
}
/**
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java Thu May 19 14:56:45 2011
@@ -355,26 +355,12 @@ public class NameFinderME implements Tok
AdaptiveFeatureGenerator generator, final Map<String, Object> resources,
int iterations, int cutoff) throws IOException {
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
- AdaptiveFeatureGenerator featureGenerator;
-
- if (generator != null)
- featureGenerator = generator;
- else
- featureGenerator = createFeatureGenerator();
-
- EventStream eventStream = new NameFinderEventStream(samples, type,
- new DefaultNameContextGenerator(featureGenerator));
- HashSumEventStream hses = new HashSumEventStream(eventStream);
- AbstractModel nameFinderModel = GIS.trainModel(iterations, new TwoPassDataIndexer(hses, cutoff));
-
- manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
- hses.calculateHashSum().toString(16));
-
- return new TokenNameFinderModel(languageCode, nameFinderModel,
- resources, manifestInfoEntries);
+ return train(languageCode, type, samples, mlParams, generator, resources);
}
public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples,
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java Thu May 19 14:56:45 2011
@@ -29,18 +29,13 @@ import java.util.StringTokenizer;
import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.TrainUtil;
-import opennlp.model.TwoPassDataIndexer;
-import opennlp.perceptron.SimplePerceptronSequenceTrainer;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.BeamSearch;
-import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelType;
-import opennlp.tools.util.model.ModelUtil;
/**
* A part-of-speech tagger that uses maximum entropy. Tries to predict whether
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java Thu May 19 14:56:45 2011
@@ -266,29 +266,6 @@ public class SentenceDetectorME implemen
return true;
}
- public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
- boolean useTokenEnd, Dictionary abbreviations) throws IOException {
- return train(languageCode, samples, useTokenEnd, abbreviations,5,100);
- }
-
- public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
- boolean useTokenEnd, Dictionary abbreviations, int cutoff, int iterations) throws IOException {
-
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
-
- Factory factory = new Factory();
-
- // TODO: Fix the EventStream to throw exceptions when training goes wrong
- EventStream eventStream = new SDEventStream(samples,
- factory.createSentenceContextGenerator(languageCode),
- factory.createEndOfSentenceScanner(languageCode));
-
- GISModel sentModel = GIS.trainModel(eventStream, iterations, cutoff);
-
- return new SentenceModel(languageCode, sentModel,
- useTokenEnd, abbreviations, manifestInfoEntries);
- }
public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
boolean useTokenEnd, Dictionary abbreviations, TrainingParameters mlParams) throws IOException {
@@ -302,13 +279,25 @@ public class SentenceDetectorME implemen
factory.createSentenceContextGenerator(languageCode),
factory.createEndOfSentenceScanner(languageCode));
- HashSumEventStream hses = new HashSumEventStream(eventStream);
- AbstractModel sentModel = TrainUtil.train(hses, mlParams.getSettings(), manifestInfoEntries);
-
- manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
- hses.calculateHashSum().toString(16));
+ AbstractModel sentModel = TrainUtil.train(eventStream, mlParams.getSettings(), manifestInfoEntries);
return new SentenceModel(languageCode, sentModel,
useTokenEnd, abbreviations, manifestInfoEntries);
}
+
+ public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
+ boolean useTokenEnd, Dictionary abbreviations, int cutoff, int iterations) throws IOException {
+
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+
+ return train(languageCode, samples, useTokenEnd, abbreviations, mlParams);
+ }
+
+ public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
+ boolean useTokenEnd, Dictionary abbreviations) throws IOException {
+ return train(languageCode, samples, useTokenEnd, abbreviations,5,100);
+ }
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java Thu May 19 14:56:45 2011
@@ -225,21 +225,12 @@ public class TokenizerME extends Abstrac
public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
boolean useAlphaNumericOptimization, int cutoff, int iterations) throws IOException {
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
- EventStream eventStream = new TokSpanEventStream(samples,
- useAlphaNumericOptimization);
-
- HashSumEventStream hses = new HashSumEventStream(eventStream);
- GISModel maxentModel =
- GIS.trainModel(iterations, new TwoPassDataIndexer(hses, cutoff));
-
- manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
- hses.calculateHashSum().toString(16));
-
- return new TokenizerModel(languageCode, maxentModel,
- useAlphaNumericOptimization, manifestInfoEntries);
+ return train(languageCode, samples, useAlphaNumericOptimization, mlParams);
}