You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2014/03/10 22:14:40 UTC

svn commit: r1576089 - /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java

Author: joern
Date: Mon Mar 10 21:14:40 2014
New Revision: 1576089

URL: http://svn.apache.org/r1576089
Log:
OPENNLP-580 Added a factory to construct the name finder

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1576089&r1=1576088&r2=1576089&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java Mon Mar 10 21:14:40 2014
@@ -48,6 +48,7 @@ import opennlp.tools.util.featuregen.Ada
 import opennlp.tools.util.featuregen.AdditionalContextFeatureGenerator;
 import opennlp.tools.util.featuregen.BigramNameFeatureGenerator;
 import opennlp.tools.util.featuregen.CachedFeatureGenerator;
+import opennlp.tools.util.featuregen.FeatureGeneratorFactory;
 import opennlp.tools.util.featuregen.FeatureGeneratorResourceProvider;
 import opennlp.tools.util.featuregen.GeneratorFactory;
 import opennlp.tools.util.featuregen.OutcomePriorFeatureGenerator;
@@ -83,7 +84,17 @@ public class NameFinderME implements Tok
   private SequenceValidator<String> sequenceValidator;
 
   public NameFinderME(TokenNameFinderModel model) {
-    this(model, DEFAULT_BEAM_SIZE);
+    
+    TokenNameFinderFactory factory = model.getFactory();
+    
+    seqCodec = factory.createSequenceCodec();
+    sequenceValidator = seqCodec.createSequenceValidator();
+    this.model = model.getNameFinderSequenceModel();
+    contextGenerator = factory.createContextGenerator();
+    
+    // TODO: Deprecate adding the additional context feature generator here and find a better solution.
+    contextGenerator.addFeatureGenerator(
+          new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));
   }
 
   /**
@@ -94,12 +105,15 @@ public class NameFinderME implements Tok
    * 
    * @deprecated the beam size is now configured during training time in the trainer parameter
    * file via beamSearch.beamSize
+   * 
+   * Use {@link #NameFinderME(TokenNameFinderModel)} instead and configure it
+   * through the {@link TokenNameFinderFactory}.
    */
   @Deprecated
   public NameFinderME(TokenNameFinderModel model, AdaptiveFeatureGenerator generator, int beamSize,
       SequenceValidator<String> sequenceValidator) {
     
-    seqCodec = model.createSequenceCodec();
+    seqCodec = model.getFactory().createSequenceCodec();
     
     this.sequenceValidator = sequenceValidator;
    
@@ -135,10 +149,6 @@ public class NameFinderME implements Tok
 
     if (this.sequenceValidator == null)
       this.sequenceValidator = new NameFinderSequenceValidator();
-
-    // TODO: How to combine different sequence validators ?!
-    
-    this.sequenceValidator = seqCodec.createSequenceValidator();
   }
 
   /**
@@ -158,7 +168,7 @@ public class NameFinderME implements Tok
     this(model, null, beamSize);
   }
 
-  private static AdaptiveFeatureGenerator createFeatureGenerator() {
+  static AdaptiveFeatureGenerator createFeatureGenerator() {
    return new CachedFeatureGenerator(
          new AdaptiveFeatureGenerator[]{
            new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
@@ -284,6 +294,61 @@ public class NameFinderME implements Tok
      return sprobs;
    }
 
+   public static TokenNameFinderModel train(String languageCode, String type, 
+       ObjectStream<NameSample> samples, TrainingParameters trainParams,
+       TokenNameFinderFactory factory) throws IOException {
+     String beamSizeString = trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
+     
+     int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
+     if (beamSizeString != null) {
+       beamSize = Integer.parseInt(beamSizeString);
+     }
+     
+     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+
+     MaxentModel nameFinderModel = null;
+     
+     SequenceClassificationModel<String> seqModel = null;
+     
+     TrainerType trainerType = TrainerFactory.getTrainerType(trainParams.getSettings());
+     
+     if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
+       ObjectStream<Event> eventStream = new NameFinderEventStream(samples, type,
+           factory.createContextGenerator(), factory.createSequenceCodec());
+
+       EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
+       nameFinderModel = trainer.train(eventStream);
+     }
+     // TODO: The next two branches pass the factory's context generator but, unlike the
+     // branch above, not its sequence codec. Check whether that is a good idea.
+     else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
+       NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, factory.createContextGenerator());
+
+       EventModelSequenceTrainer trainer = TrainerFactory.getEventModelSequenceTrainer(
+           trainParams.getSettings(), manifestInfoEntries);
+       nameFinderModel = trainer.train(ss);
+     }
+     else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
+       SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
+           trainParams.getSettings(), manifestInfoEntries);
+       
+       NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, factory.createContextGenerator(), false);
+       seqModel = trainer.train(ss);
+     }
+     else {
+       throw new IllegalStateException("Unexpected trainer type!");
+     }
+     
+     if (seqModel != null) {
+       return new TokenNameFinderModel(languageCode, seqModel, null,
+           factory.getResources(), manifestInfoEntries, factory.getSequenceCodec());
+     }
+     else {
+       return new TokenNameFinderModel(languageCode, nameFinderModel, beamSize, null,
+           factory.getResources(), manifestInfoEntries, factory.getSequenceCodec());
+     }
+   }
+   
    /**
     * Trains a name finder model.
     *
@@ -303,10 +368,16 @@ public class NameFinderME implements Tok
     * @return the newly trained model
     *
     * @throws IOException
+    * @deprecated use {@link NameFinderME#train(String, String, ObjectStream, TrainingParameters, TokenNameFinderFactory)} instead.
     */
+   @Deprecated
    public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples,
-       TrainingParameters trainParams, AdaptiveFeatureGenerator generator, final Map<String, Object> resources,
-       SequenceCodec<String> seqCodec) throws IOException {
+       TrainingParameters trainParams, AdaptiveFeatureGenerator generator, final Map<String, Object> resources)
+           throws IOException {
+     
+     if (languageCode == null) {
+       throw new IllegalArgumentException("languageCode must not be null!");
+     }
      
      String beamSizeString = trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
      
@@ -315,9 +386,6 @@ public class NameFinderME implements Tok
        beamSize = Integer.parseInt(beamSizeString);
      }
      
-     if (languageCode == null) {
-       throw new IllegalArgumentException("languageCode must not be null!");
-     }
      
      Map<String, String> manifestInfoEntries = new HashMap<String, String>();
 
@@ -336,7 +404,7 @@ public class NameFinderME implements Tok
      
      if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
        ObjectStream<Event> eventStream = new NameFinderEventStream(samples, type,
-           new DefaultNameContextGenerator(featureGenerator), seqCodec);
+           new DefaultNameContextGenerator(featureGenerator), new BioCodec());
 
        EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams.getSettings(), manifestInfoEntries);
        nameFinderModel = trainer.train(eventStream);
@@ -366,18 +434,13 @@ public class NameFinderME implements Tok
      // depending on which one is not null!
      if (seqModel != null) {
        return new TokenNameFinderModel(languageCode, seqModel, null,
-           resources, manifestInfoEntries, seqCodec);
+           resources, manifestInfoEntries, new BioCodec());
      }
      else {
        return new TokenNameFinderModel(languageCode, nameFinderModel, beamSize, null,
-           resources, manifestInfoEntries, seqCodec);
+           resources, manifestInfoEntries, new BioCodec());
      }
    }
-
-   public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples,
-       TrainingParameters trainParams, AdaptiveFeatureGenerator generator, final Map<String, Object> resources) throws IOException {
-     return train(languageCode, type, samples, trainParams, generator, resources, new BioCodec());
-   }
    
   /**
    * Trains a name finder model.
@@ -398,14 +461,17 @@ public class NameFinderME implements Tok
    * @return the newly trained model
    *
    * @throws IOException
+   * @deprecated use {@link NameFinderME#train(String, String, ObjectStream, TrainingParameters, TokenNameFinderFactory)} instead.
    */
+   @Deprecated
   public static TokenNameFinderModel train(String languageCode, String type,
       ObjectStream<NameSample> samples, TrainingParameters trainParams,
-      byte[] featureGeneratorBytes, final Map<String, Object> resources, SequenceCodec<String> seqCodec)
+      byte[] featureGeneratorBytes, final Map<String, Object> resources,
+      TokenNameFinderFactory factory)
       throws IOException {
 
     TokenNameFinderModel model = train(languageCode, type, samples, trainParams,
-        createFeatureGenerator(featureGeneratorBytes, resources), resources, seqCodec);
+        createFeatureGenerator(featureGeneratorBytes, resources), resources);
 
     if (featureGeneratorBytes != null) {
       model = model.updateFeatureGenerator(featureGeneratorBytes);
@@ -414,21 +480,28 @@ public class NameFinderME implements Tok
     return model;
   }
 
+   /**
+    * Trains a name finder model.
+    * @deprecated use {@link NameFinderME#train(String, String, ObjectStream, TrainingParameters, TokenNameFinderFactory)} instead.
+    */
+   @Deprecated
   public static TokenNameFinderModel train(String languageCode, String type,
       ObjectStream<NameSample> samples, TrainingParameters trainParams,
       byte[] featureGeneratorBytes, final Map<String, Object> resources)
       throws IOException {
-    return train(languageCode, type, samples, trainParams, featureGeneratorBytes,
-        resources, new BioCodec());
+    return train(languageCode, type, samples, trainParams, featureGeneratorBytes, resources);
   }
 
+   /**
+    * @deprecated use {@link NameFinderME#train(String, String, ObjectStream, TrainingParameters, TokenNameFinderFactory)} instead.
+    */
+   @Deprecated
    public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples,
        final Map<String, Object> resources) throws IOException {
      return NameFinderME.train(languageCode, type, samples,
          ModelUtil.createDefaultTrainingParameters(), (byte[]) null, resources);
    }
 
-
   /**
    * Gets the name type from the outcome
    * @param outcome the outcome