You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/07/22 19:19:05 UTC

svn commit: r1149660 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/tokenizer/ tokenize/ tokenize/lang/

Author: colen
Date: Fri Jul 22 17:19:02 2011
New Revision: 1149660

URL: http://svn.apache.org/viewvc?rev=1149660&view=rev
Log:
OPENNLP-237 Adds abbreviation dictionary to Tokenizer. The Factory class was inspired by the SentenceDetector component.

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java   (with props)
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java Fri Jul 22 17:19:02 2011
@@ -27,6 +27,7 @@ import opennlp.tools.cmdline.CLI;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.tokenize.TokenSample;
 import opennlp.tools.tokenize.TokenizerCrossValidator;
 import opennlp.tools.util.ObjectStream;
@@ -73,17 +74,17 @@ public final class TokenizerCrossValidat
     
     
     TokenizerCrossValidator validator;
+    
+    if (mlParams == null)
+      mlParams = TokenizerTrainerTool.createTrainingParameters(
+          params.getIterations(), params.getCutoff());
 
-    if (mlParams == null) {
-      validator = new opennlp.tools.tokenize.TokenizerCrossValidator(
-          params.getLang(), params.getAlphaNumOpt(), params.getCutoff(),
-          params.getIterations());
-    } else {
-      validator = new opennlp.tools.tokenize.TokenizerCrossValidator(
-          params.getLang(), params.getAlphaNumOpt(), mlParams);
-    }
-      
     try {
+      Dictionary dict = TokenizerTrainerTool.loadDict(params.getAbbDict(), params.getIsAbbDictCS());
+
+      validator = new opennlp.tools.tokenize.TokenizerCrossValidator(
+          params.getLang(), dict, params.getAlphaNumOpt(), mlParams);
+
       validator.evaluate(sampleStream, params.getFolds(), params.getMisclassified());
     }
     catch (IOException e) {

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java Fri Jul 22 17:19:02 2011
@@ -29,11 +29,13 @@ import opennlp.tools.cmdline.CmdLineTool
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
 import opennlp.tools.cmdline.TrainingToolParams;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.tokenize.TokenSample;
 import opennlp.tools.tokenize.TokenSampleStream;
 import opennlp.tools.tokenize.TokenizerModel;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
 
 public final class TokenizerTrainerTool implements CmdLineTool {
   
@@ -65,6 +67,15 @@ public final class TokenizerTrainerTool 
 
     return new TokenSampleStream(lineStream);
   }
+  
+  static Dictionary loadDict(File f, boolean caseSensitive) throws IOException {
+    Dictionary dict = null;
+    if (f != null) {
+      CmdLineUtil.checkInputFile("abb dict", f);
+      dict = new Dictionary(new FileInputStream(f), caseSensitive);
+    }
+    return dict;
+  }
 
   public void run(String[] args) {
     if (!ArgumentParser.validateArguments(args, TrainerToolParams.class)) {
@@ -96,21 +107,15 @@ public final class TokenizerTrainerTool 
     CmdLineUtil.checkOutputFile("tokenizer model", modelOutFile);
     ObjectStream<TokenSample> sampleStream = openSampleData("Training",
         trainingDataInFile, params.getEncoding());
+    
+    if(mlParams == null) 
+      mlParams = createTrainingParameters(params.getIterations(), params.getCutoff());
 
     TokenizerModel model;
     try {
-      if (mlParams == null) {
-        model = opennlp.tools.tokenize.TokenizerME.train(
-            params.getLang(), sampleStream, 
-            params.getAlphaNumOpt(),
-            params.getCutoff(), params.getIterations());
-      }
-      else {
-        model = opennlp.tools.tokenize.TokenizerME.train(
-            params.getLang(), sampleStream, 
-            params.getAlphaNumOpt(),
-            mlParams);
-      }
+      Dictionary dict = loadDict(params.getAbbDict(), params.getIsAbbDictCS());
+      model = opennlp.tools.tokenize.TokenizerME.train(params.getLang(),
+          sampleStream, dict, params.getAlphaNumOpt(), mlParams);
     } catch (IOException e) {
       CmdLineUtil.printTrainingIoError(e);
       throw new TerminateToolException(-1);
@@ -125,4 +130,13 @@ public final class TokenizerTrainerTool 
 
     CmdLineUtil.writeModel("tokenizer", modelOutFile, model);
   }
+
+  public static TrainingParameters createTrainingParameters(Integer iterations, Integer cutoff) {
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM,
+        iterations.toString());
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, cutoff.toString());
+    return mlParams;
+  }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java Fri Jul 22 17:19:02 2011
@@ -17,6 +17,8 @@
 
 package opennlp.tools.cmdline.tokenizer;
 
+import java.io.File;
+
 import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
 import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.BasicTrainingParams;
@@ -30,4 +32,12 @@ interface TrainingParams extends BasicTr
   @ParameterDescription(valueName = "isAlphaNumOpt", description = "Optimization flag to skip alpha numeric tokens for further tokenization")
   @OptionalParameter(defaultValue = "false")
   Boolean getAlphaNumOpt();
+  
+  @ParameterDescription(valueName = "path", description = "The abbreviation dictionary in XML format.")
+  @OptionalParameter
+  File getAbbDict();
+
+  @ParameterDescription(valueName = "true|false", description = "True if the abbreviation dictionary is case sensitive. Default is true.")
+  @OptionalParameter(defaultValue = "true")
+  Boolean getIsAbbDictCS();
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java Fri Jul 22 17:19:02 2011
@@ -19,7 +19,9 @@
 package opennlp.tools.tokenize;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
+import java.util.Set;
 
 import opennlp.tools.util.StringUtil;
 
@@ -27,14 +29,34 @@ import opennlp.tools.util.StringUtil;
  * Generate events for maxent decisions for tokenization.
  */
 public class DefaultTokenContextGenerator implements TokenContextGenerator {
+  
+  private final Set<String> inducedAbbreviations;
+  
+  /**
+   * Creates a default context generator for tokenizer.
+   */
+  public DefaultTokenContextGenerator() {
+    this(Collections.<String>emptySet());
+  }
+  
+  /**
+   * Creates a default context generator for tokenizer.
+   * 
+   * @param inducedAbbreviations the induced abbreviations
+   */
+  public DefaultTokenContextGenerator(Set<String> inducedAbbreviations) {
+    this.inducedAbbreviations = inducedAbbreviations;
+  }
 
   /* (non-Javadoc)
    * @see opennlp.tools.tokenize.TokenContextGenerator#getContext(java.lang.String, int)
    */
   public String[] getContext(String sentence, int index) {
     List<String> preds = new ArrayList<String>();
-    preds.add("p=" + sentence.substring(0, index));
-    preds.add("s=" + sentence.substring(index));
+    String prefix = sentence.substring(0, index);
+    String suffix = sentence.substring(index);
+    preds.add("p=" + prefix);
+    preds.add("s=" + suffix);
     if (index > 0) {
       addCharPreds("p1", sentence.charAt(index - 1), preds);
       if (index > 1) {
@@ -60,6 +82,14 @@ public class DefaultTokenContextGenerato
     if (sentence.charAt(0) == '&' && sentence.charAt(sentence.length() - 1) == ';') {
       preds.add("cc");//character code
     }
+    
+    if(index == sentence.length() - 1 && inducedAbbreviations.contains(sentence)) {
+      preds.add("pabb");
+    }
+    
+    if(inducedAbbreviations.contains(sentence)) {
+      preds.add("abb");
+    }
 
     String[] context = new String[preds.size()];
     preds.toArray(context);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java Fri Jul 22 17:19:02 2011
@@ -23,8 +23,10 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import java.util.regex.Pattern;
 
 import opennlp.model.Event;
+import opennlp.tools.tokenize.lang.Factory;
 import opennlp.tools.util.AbstractEventStream;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
@@ -41,6 +43,23 @@ public class TokSpanEventStream extends 
   private TokenContextGenerator cg;
 
   private boolean skipAlphaNumerics;
+  
+  private final Pattern alphaNumeric;
+  
+  /**
+   * Initializes the current instance.
+   *
+   * @param tokenSamples
+   * @param skipAlphaNumerics
+   * @param cg
+   */
+  public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
+        boolean skipAlphaNumerics, Pattern alphaNumeric, TokenContextGenerator cg) {
+    super(tokenSamples);
+    this.alphaNumeric = alphaNumeric;
+    this.skipAlphaNumerics = skipAlphaNumerics;
+    this.cg = cg;
+  }
 
   /**
    * Initializes the current instance.
@@ -52,7 +71,8 @@ public class TokSpanEventStream extends 
   public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
         boolean skipAlphaNumerics, TokenContextGenerator cg) {
     super(tokenSamples);
-
+    Factory factory = new Factory();
+    this.alphaNumeric = factory.getAlphanumeric(null);
     this.skipAlphaNumerics = skipAlphaNumerics;
     this.cg = cg;
   }
@@ -99,7 +119,7 @@ public class TokSpanEventStream extends 
         cSpan = new Span(cSpan.getStart() + start, cSpan.getEnd() + start);
         //should we skip this token
         if (ctok.length() > 1
-          && (!skipAlphaNumerics || !TokenizerME.alphaNumeric.matcher(ctok).matches())) {
+          && (!skipAlphaNumerics || !alphaNumeric.matcher(ctok).matches())) {
 
           //find offsets of annotated tokens inside of candidate tokens
           boolean foundTrainingTokens = false;

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java Fri Jul 22 17:19:02 2011
@@ -19,6 +19,7 @@ package opennlp.tools.tokenize;
 
 import java.io.IOException;
 
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.eval.CrossValidationPartitioner;
@@ -31,32 +32,31 @@ public class TokenizerCrossValidator {
   
   private final TrainingParameters params;
   
-  private final int cutoff;
-  private final int iterations;
+  private final Dictionary abbreviations;
   
   private FMeasure fmeasure = new FMeasure();
   
   
   public TokenizerCrossValidator(String language, boolean alphaNumericOptimization, int cutoff, int iterations) {
-    this.language = language;
-    this.alphaNumericOptimization = alphaNumericOptimization;
-    this.cutoff = cutoff;
-    this.iterations = iterations;
-    
-    params = null;
+    this(language, alphaNumericOptimization, createTrainingParameters(iterations, cutoff));
   }
   
   public TokenizerCrossValidator(String language, boolean alphaNumericOptimization) {
-    this(language, alphaNumericOptimization, 5, 100);
+    this(language, alphaNumericOptimization, createTrainingParameters(100, 5));
   }  
   
   public TokenizerCrossValidator(String language, boolean alphaNumericOptimization, TrainingParameters params) {
+    this(language, null, alphaNumericOptimization, params);
+  }
+  
+  public TokenizerCrossValidator(String language, Dictionary abbreviations,
+      boolean alphaNumericOptimization, TrainingParameters params) {
+    
     this.language = language;
     this.alphaNumericOptimization = alphaNumericOptimization;
-    this.cutoff = -1;
-    this.iterations = -1;
-    
+    this.abbreviations = abbreviations;
     this.params = params;
+    
   }
   
   
@@ -101,14 +101,8 @@ public class TokenizerCrossValidator {
        // Maybe throws IOException if temporary file handling fails ...
        TokenizerModel model;
        
-       if (params == null) {
-         model = TokenizerME.train(language, trainingSampleStream, 
-             alphaNumericOptimization, cutoff, iterations);
-       }
-       else {
-         model = TokenizerME.train(language, trainingSampleStream, 
-             alphaNumericOptimization, params);
-       }
+      model = TokenizerME.train(language, trainingSampleStream, abbreviations,
+          alphaNumericOptimization, params);
        
        TokenizerEvaluator evaluator = new TokenizerEvaluator(new TokenizerME(model), printErrors);
        evaluator.evaluate(trainingSampleStream.getTestSampleStream());
@@ -119,4 +113,14 @@ public class TokenizerCrossValidator {
   public FMeasure getFMeasure() {
     return fmeasure;
   }
+  
+  //TODO: this could go to a common util method, maybe inside TrainingParameters class
+  static TrainingParameters createTrainingParameters(int iterations, int cutoff) {
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM,
+        Integer.toString(iterations));
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+    return mlParams;
+  }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java Fri Jul 22 17:19:02 2011
@@ -20,15 +20,19 @@ package opennlp.tools.tokenize;
 import java.io.IOException;
 import java.io.ObjectStreamException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.regex.Pattern;
 
 import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
 import opennlp.model.MaxentModel;
 import opennlp.model.TrainUtil;
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.tokenize.lang.Factory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
@@ -84,8 +88,11 @@ public class TokenizerME extends Abstrac
 
   /**
    * Alpha-Numeric Pattern
+   * @deprecated As of release 1.5.2, replaced by {@link Factory#getAlphanumeric(String)}
    */
-  public static final Pattern alphaNumeric = Pattern.compile("^[A-Za-z0-9]+$");
+  public static final Pattern alphaNumeric = Pattern.compile(Factory.DEFAULT_ALPHANUMERIC);
+  
+  private final Pattern alphanumeric;
 
   /**
    * The maximum entropy model to use to evaluate contexts.
@@ -95,7 +102,7 @@ public class TokenizerME extends Abstrac
   /**
    * The context generator.
    */
-  private final TokenContextGenerator cg = new DefaultTokenContextGenerator();
+  private final TokenContextGenerator cg;
 
   /**
    * Optimization flag to skip alpha numeric tokens for further
@@ -112,12 +119,29 @@ public class TokenizerME extends Abstrac
   private List<Span> newTokens;
 
   public TokenizerME(TokenizerModel model) {
+    this(model, new Factory());
+  }
+  
+  public TokenizerME(TokenizerModel model, Factory factory) {
+    String languageCode = model.getLanguage();
+
+    this.alphanumeric = factory.getAlphanumeric(languageCode);
+    this.cg = factory.createTokenContextGenerator(languageCode,
+        getAbbreviations(model.getAbbreviations()));
+
     this.model = model.getMaxentModel();
     useAlphaNumericOptimization = model.useAlphaNumericOptimization();
 
     newTokens = new ArrayList<Span>();
     tokProbs = new ArrayList<Double>(50);
   }
+  
+  private static Set<String> getAbbreviations(Dictionary abbreviations) {
+    if(abbreviations == null) {
+      return Collections.<String>emptySet();
+    }
+    return abbreviations.asStringSet();
+  }
 
   /**
    * Returns the probabilities associated with the most recent
@@ -154,7 +178,7 @@ public class TokenizerME extends Abstrac
         newTokens.add(s);
         tokProbs.add(1d);
       }
-      else if (useAlphaNumericOptimization() && alphaNumeric.matcher(tok).matches()) {
+      else if (useAlphaNumericOptimization() && alphanumeric.matcher(tok).matches()) {
         newTokens.add(s);
         tokProbs.add(1d);
       }
@@ -185,17 +209,60 @@ public class TokenizerME extends Abstrac
     return spans;
   }
 
+  /**
+   * Trains a model for the {@link TokenizerME}.
+   *
+   * @param languageCode the language of the natural text
+   * @param samples the samples used for the training.
+   * @param useAlphaNumericOptimization - if true alpha numerics are skipped
+   * @param mlParams the machine learning train parameters
+   * 
+   * @return the trained {@link TokenizerModel}
+   *
+   * @throws IOException it throws an {@link IOException} if an {@link IOException}
+   * is thrown during IO operations on a temp file which is created during training.
+   * Or if reading from the {@link ObjectStream} fails.
+   * 
+   */
   public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
       boolean useAlphaNumericOptimization, TrainingParameters mlParams) throws IOException {
+    return train(languageCode, samples, null, useAlphaNumericOptimization,
+        mlParams);
+  }
+  
+  /**
+   * Trains a model for the {@link TokenizerME}.
+   *
+   * @param languageCode the language of the natural text
+   * @param samples the samples used for the training.
+   * @param abbreviations an abbreviations dictionary
+   * @param useAlphaNumericOptimization - if true alpha numerics are skipped
+   * @param mlParams the machine learning train parameters
+   * 
+   * @return the trained {@link TokenizerModel}
+   *
+   * @throws IOException it throws an {@link IOException} if an {@link IOException}
+   * is thrown during IO operations on a temp file which is created during training.
+   * Or if reading from the {@link ObjectStream} fails.
+   * 
+   */
+  public static TokenizerModel train(String languageCode,
+      ObjectStream<TokenSample> samples, Dictionary abbreviations,
+      boolean useAlphaNumericOptimization, TrainingParameters mlParams)
+      throws IOException {
+    Factory factory = new Factory();
 
     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-    
+
     EventStream eventStream = new TokSpanEventStream(samples,
-        useAlphaNumericOptimization);
+        useAlphaNumericOptimization, factory.getAlphanumeric(languageCode),
+        factory.createTokenContextGenerator(languageCode,
+            getAbbreviations(abbreviations)));
 
-    AbstractModel maxentModel = TrainUtil.train(eventStream, mlParams.getSettings(), manifestInfoEntries);
-    
-    return new TokenizerModel(languageCode, maxentModel, 
+    AbstractModel maxentModel = TrainUtil.train(eventStream,
+        mlParams.getSettings(), manifestInfoEntries);
+
+    return new TokenizerModel(languageCode, maxentModel, abbreviations,
         useAlphaNumericOptimization, manifestInfoEntries);
   }
   

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java Fri Jul 22 17:19:02 2011
@@ -29,6 +29,7 @@ import java.util.Map;
 import opennlp.maxent.io.BinaryGISModelReader;
 import opennlp.model.AbstractModel;
 import opennlp.model.MaxentModel;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.model.BaseModel;
 import opennlp.tools.util.model.ModelUtil;
@@ -44,6 +45,7 @@ public final class TokenizerModel extend
   private static final String COMPONENT_NAME = "TokenizerME";
   
   private static final String TOKENIZER_MODEL_ENTRY = "token.model";
+  private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
 
   private static final String USE_ALPHA_NUMERIC_OPTIMIZATION =
       "useAlphaNumericOptimization";
@@ -55,24 +57,44 @@ public final class TokenizerModel extend
    * @param useAlphaNumericOptimization
    */
   public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
-      boolean useAlphaNumericOptimization, Map<String, String> manifestInfoEntries) {
+      Dictionary abbreviations, boolean useAlphaNumericOptimization,
+      Map<String, String> manifestInfoEntries) {
     super(COMPONENT_NAME, language, manifestInfoEntries);
 
     if (tokenizerMaxentModel == null)
-        throw new IllegalArgumentException("tokenizerMaxentModel param must not bet null!");
+      throw new IllegalArgumentException(
+          "tokenizerMaxentModel param must not bet null!");
 
     if (!isModelCompatible(tokenizerMaxentModel))
-        throw new IllegalArgumentException("The maxent model is not compatible!");
+      throw new IllegalArgumentException("The maxent model is not compatible!");
 
     artifactMap.put(TOKENIZER_MODEL_ENTRY, tokenizerMaxentModel);
 
     setManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION,
         Boolean.toString(useAlphaNumericOptimization));
+
+    // Abbreviations are optional
+    if (abbreviations != null)
+      artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviations);
+  }
+
+  /**
+   * Initializes the current instance.
+   *
+   * @param language
+   * @param tokenizerMaxentModel
+   * @param useAlphaNumericOptimization
+   * @param manifestInfoEntries
+   */
+  public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
+      boolean useAlphaNumericOptimization, Map<String, String> manifestInfoEntries) {
+    this(language, tokenizerMaxentModel, null, useAlphaNumericOptimization, manifestInfoEntries);
   }
 
   /**
    * Initializes the current instance.
    *
+   * @param language
    * @param tokenizerMaxentModel
    * @param useAlphaNumericOptimization
    */
@@ -119,11 +141,21 @@ public final class TokenizerModel extend
       throw new InvalidFormatException("The " + USE_ALPHA_NUMERIC_OPTIMIZATION + " parameter " +
           "cannot be found!");
     }
+    
+    Object abbreviationsEntry = artifactMap.get(ABBREVIATIONS_ENTRY_NAME);
+
+    if (abbreviationsEntry != null && !(abbreviationsEntry instanceof Dictionary)) {
+      throw new InvalidFormatException("Abbreviations dictionary has wrong type!");
+    }
   }
 
   public AbstractModel getMaxentModel() {
     return (AbstractModel) artifactMap.get(TOKENIZER_MODEL_ENTRY);
   }
+  
+  public Dictionary getAbbreviations() {
+    return (Dictionary) artifactMap.get(ABBREVIATIONS_ENTRY_NAME);
+  }
 
   public boolean useAlphaNumericOptimization() {
     String optimization = getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION);

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java?rev=1149660&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java Fri Jul 22 17:19:02 2011
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize.lang;
+
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import opennlp.tools.tokenize.DefaultTokenContextGenerator;
+import opennlp.tools.tokenize.TokenContextGenerator;
+
+public class Factory {
+  
+  public static final String DEFAULT_ALPHANUMERIC = "^[A-Za-z0-9]+$";
+  
+  /**
+   * Gets the alpha numeric pattern for the language. Please save the value
+   * locally because this call is expensive.
+   * 
+   * @param languageCode
+   *          the language code. If null or unknown the default pattern will be
+   *          returned.
+   * @return the alpha numeric pattern for the language or the default pattern.
+   */
+  public Pattern getAlphanumeric(String languageCode) {
+    if("pt".equals(languageCode)) {
+      return Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
+    }
+    
+    return Pattern.compile(DEFAULT_ALPHANUMERIC);
+  }
+  
+  public TokenContextGenerator createTokenContextGenerator(String languageCode, Set<String> abbreviations) {
+    return new DefaultTokenContextGenerator(abbreviations);
+  }
+
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain