You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/19 16:56:45 UTC

svn commit: r1124880 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: chunker/ doccat/ namefind/ postag/ sentdetect/ tokenize/

Author: joern
Date: Thu May 19 14:56:45 2011
New Revision: 1124880

URL: http://svn.apache.org/viewvc?rev=1124880&view=rev
Log:
OPENNLP-175 Removed duplicate code

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java Thu May 19 14:56:45 2011
@@ -219,19 +219,12 @@ public class ChunkerME implements Chunke
       int cutoff, int iterations, ChunkerContextGenerator contextGenerator)
       throws IOException {
     
-    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
     
-    EventStream es = new ChunkerEventStream(in, contextGenerator);
-    HashSumEventStream hses = new HashSumEventStream(es);
-    
-    AbstractModel maxentModel = opennlp.maxent.GIS.trainModel(iterations, 
-        new TwoPassDataIndexer(hses, cutoff));
-    
-    manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
-        hses.calculateHashSum().toString(16));
-    
-    return new ChunkerModel(lang, maxentModel, manifestInfoEntries);
+    return train(lang, in, contextGenerator, mlParams);
   }
   
   /**

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java Thu May 19 14:56:45 2011
@@ -172,13 +172,12 @@ public class DocumentCategorizerME imple
   public static DoccatModel train(String languageCode, ObjectStream<DocumentSample> samples, int cutoff, int iterations, FeatureGenerator... featureGenerators)
       throws IOException {
     
-    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
     
-    AbstractModel model = GIS.trainModel(iterations, new TwoPassDataIndexer(
-        new DocumentCategorizerEventStream(samples, featureGenerators), cutoff));
-    
-    return new DoccatModel(languageCode, model, manifestInfoEntries);
+    return train(languageCode, samples, mlParams);
   }
   
   /**

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java Thu May 19 14:56:45 2011
@@ -355,26 +355,12 @@ public class NameFinderME implements Tok
        AdaptiveFeatureGenerator generator, final Map<String, Object> resources, 
        int iterations, int cutoff) throws IOException {
      
-     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-     ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+     TrainingParameters mlParams = new TrainingParameters();
+     mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+     mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+     mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
      
-     AdaptiveFeatureGenerator featureGenerator;
-     
-     if (generator != null)
-       featureGenerator = generator;
-     else 
-       featureGenerator = createFeatureGenerator();
-     
-     EventStream eventStream = new NameFinderEventStream(samples, type,
-         new DefaultNameContextGenerator(featureGenerator));
-     HashSumEventStream hses = new HashSumEventStream(eventStream);
-     AbstractModel nameFinderModel = GIS.trainModel(iterations, new TwoPassDataIndexer(hses, cutoff));
-     
-     manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
-         hses.calculateHashSum().toString(16));
-     
-     return new TokenNameFinderModel(languageCode, nameFinderModel,
-         resources, manifestInfoEntries);
+     return train(languageCode, type, samples, mlParams, generator, resources);
    }
 
    public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples, 

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java Thu May 19 14:56:45 2011
@@ -29,18 +29,13 @@ import java.util.StringTokenizer;
 import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
 import opennlp.model.TrainUtil;
-import opennlp.model.TwoPassDataIndexer;
-import opennlp.perceptron.SimplePerceptronSequenceTrainer;
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.BeamSearch;
-import opennlp.tools.util.HashSumEventStream;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.SequenceValidator;
 import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.util.model.BaseModel;
 import opennlp.tools.util.model.ModelType;
-import opennlp.tools.util.model.ModelUtil;
 
 /**
  * A part-of-speech tagger that uses maximum entropy.  Tries to predict whether

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java Thu May 19 14:56:45 2011
@@ -266,29 +266,6 @@ public class SentenceDetectorME implemen
     return true;
   }
   
-  public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
-      boolean useTokenEnd, Dictionary abbreviations) throws IOException {
-    return train(languageCode, samples, useTokenEnd, abbreviations,5,100);
-  }
-  
-  public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
-      boolean useTokenEnd, Dictionary abbreviations, int cutoff, int iterations) throws IOException {
-
-    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
-    
-    Factory factory = new Factory();
-
-    // TODO: Fix the EventStream to throw exceptions when training goes wrong
-    EventStream eventStream = new SDEventStream(samples,
-        factory.createSentenceContextGenerator(languageCode),
-        factory.createEndOfSentenceScanner(languageCode));
-    
-    GISModel sentModel = GIS.trainModel(eventStream, iterations, cutoff);
-
-    return new SentenceModel(languageCode, sentModel,
-        useTokenEnd, abbreviations, manifestInfoEntries);
-  }
   
   public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
       boolean useTokenEnd, Dictionary abbreviations, TrainingParameters mlParams) throws IOException {
@@ -302,13 +279,25 @@ public class SentenceDetectorME implemen
         factory.createSentenceContextGenerator(languageCode),
         factory.createEndOfSentenceScanner(languageCode));
     
-    HashSumEventStream hses = new HashSumEventStream(eventStream);
-    AbstractModel sentModel = TrainUtil.train(hses, mlParams.getSettings(), manifestInfoEntries);
-    
-    manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
-        hses.calculateHashSum().toString(16));
+    AbstractModel sentModel = TrainUtil.train(eventStream, mlParams.getSettings(), manifestInfoEntries);
     
     return new SentenceModel(languageCode, sentModel,
         useTokenEnd, abbreviations, manifestInfoEntries);
   }
+  
+  public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
+      boolean useTokenEnd, Dictionary abbreviations, int cutoff, int iterations) throws IOException {
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+    
+    return train(languageCode, samples, useTokenEnd, abbreviations, mlParams);
+ }
+  
+  public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
+      boolean useTokenEnd, Dictionary abbreviations) throws IOException {
+    return train(languageCode, samples, useTokenEnd, abbreviations,5,100);
+  }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java?rev=1124880&r1=1124879&r2=1124880&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java Thu May 19 14:56:45 2011
@@ -225,21 +225,12 @@ public class TokenizerME extends Abstrac
   public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
       boolean useAlphaNumericOptimization, int cutoff, int iterations) throws IOException {
 
-    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
     
-    EventStream eventStream = new TokSpanEventStream(samples,
-        useAlphaNumericOptimization);
-
-    HashSumEventStream hses = new HashSumEventStream(eventStream);
-    GISModel maxentModel =
-        GIS.trainModel(iterations, new TwoPassDataIndexer(hses, cutoff));
-
-    manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
-        hses.calculateHashSum().toString(16));
-    
-    return new TokenizerModel(languageCode, maxentModel, 
-        useAlphaNumericOptimization, manifestInfoEntries);
+    return train(languageCode, samples, useAlphaNumericOptimization, mlParams);
   }