You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/07/22 19:19:05 UTC
svn commit: r1149660 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools:
cmdline/tokenizer/ tokenize/ tokenize/lang/
Author: colen
Date: Fri Jul 22 17:19:02 2011
New Revision: 1149660
URL: http://svn.apache.org/viewvc?rev=1149660&view=rev
Log:
OPENNLP-237 Adds abbreviation dictionary to Tokenizer. The Factory class was inspired by the SentenceDetector component.
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java (with props)
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerCrossValidatorTool.java Fri Jul 22 17:19:02 2011
@@ -27,6 +27,7 @@ import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenizerCrossValidator;
import opennlp.tools.util.ObjectStream;
@@ -73,17 +74,17 @@ public final class TokenizerCrossValidat
TokenizerCrossValidator validator;
+
+ if (mlParams == null)
+ mlParams = TokenizerTrainerTool.createTrainingParameters(
+ params.getIterations(), params.getCutoff());
- if (mlParams == null) {
- validator = new opennlp.tools.tokenize.TokenizerCrossValidator(
- params.getLang(), params.getAlphaNumOpt(), params.getCutoff(),
- params.getIterations());
- } else {
- validator = new opennlp.tools.tokenize.TokenizerCrossValidator(
- params.getLang(), params.getAlphaNumOpt(), mlParams);
- }
-
try {
+ Dictionary dict = TokenizerTrainerTool.loadDict(params.getAbbDict(), params.getIsAbbDictCS());
+
+ validator = new opennlp.tools.tokenize.TokenizerCrossValidator(
+ params.getLang(), dict, params.getAlphaNumOpt(), mlParams);
+
validator.evaluate(sampleStream, params.getFolds(), params.getMisclassified());
}
catch (IOException e) {
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TokenizerTrainerTool.java Fri Jul 22 17:19:02 2011
@@ -29,11 +29,13 @@ import opennlp.tools.cmdline.CmdLineTool
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.TrainingToolParams;
+import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
public final class TokenizerTrainerTool implements CmdLineTool {
@@ -65,6 +67,15 @@ public final class TokenizerTrainerTool
return new TokenSampleStream(lineStream);
}
+
+ static Dictionary loadDict(File f, boolean caseSensitive) throws IOException {
+ Dictionary dict = null;
+ if (f != null) {
+ CmdLineUtil.checkInputFile("abb dict", f);
+ dict = new Dictionary(new FileInputStream(f), caseSensitive);
+ }
+ return dict;
+ }
public void run(String[] args) {
if (!ArgumentParser.validateArguments(args, TrainerToolParams.class)) {
@@ -96,21 +107,15 @@ public final class TokenizerTrainerTool
CmdLineUtil.checkOutputFile("tokenizer model", modelOutFile);
ObjectStream<TokenSample> sampleStream = openSampleData("Training",
trainingDataInFile, params.getEncoding());
+
+ if(mlParams == null)
+ mlParams = createTrainingParameters(params.getIterations(), params.getCutoff());
TokenizerModel model;
try {
- if (mlParams == null) {
- model = opennlp.tools.tokenize.TokenizerME.train(
- params.getLang(), sampleStream,
- params.getAlphaNumOpt(),
- params.getCutoff(), params.getIterations());
- }
- else {
- model = opennlp.tools.tokenize.TokenizerME.train(
- params.getLang(), sampleStream,
- params.getAlphaNumOpt(),
- mlParams);
- }
+ Dictionary dict = loadDict(params.getAbbDict(), params.getIsAbbDictCS());
+ model = opennlp.tools.tokenize.TokenizerME.train(params.getLang(),
+ sampleStream, dict, params.getAlphaNumOpt(), mlParams);
} catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
throw new TerminateToolException(-1);
@@ -125,4 +130,13 @@ public final class TokenizerTrainerTool
CmdLineUtil.writeModel("tokenizer", modelOutFile, model);
}
+
+ public static TrainingParameters createTrainingParameters(Integer iterations, Integer cutoff) {
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM,
+ iterations.toString());
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, cutoff.toString());
+ return mlParams;
+ }
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/tokenizer/TrainingParams.java Fri Jul 22 17:19:02 2011
@@ -17,6 +17,8 @@
package opennlp.tools.cmdline.tokenizer;
+import java.io.File;
+
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.BasicTrainingParams;
@@ -30,4 +32,12 @@ interface TrainingParams extends BasicTr
@ParameterDescription(valueName = "isAlphaNumOpt", description = "Optimization flag to skip alpha numeric tokens for further tokenization")
@OptionalParameter(defaultValue = "false")
Boolean getAlphaNumOpt();
+
+ @ParameterDescription(valueName = "path", description = "The abbreviation dictionary in XML format.")
+ @OptionalParameter
+ File getAbbDict();
+
+ @ParameterDescription(valueName = "true|false", description = "True if the abbreviation dictionary is case sensitive. Default is true.")
+ @OptionalParameter(defaultValue = "true")
+ Boolean getIsAbbDictCS();
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/DefaultTokenContextGenerator.java Fri Jul 22 17:19:02 2011
@@ -19,7 +19,9 @@
package opennlp.tools.tokenize;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
+import java.util.Set;
import opennlp.tools.util.StringUtil;
@@ -27,14 +29,34 @@ import opennlp.tools.util.StringUtil;
* Generate events for maxent decisions for tokenization.
*/
public class DefaultTokenContextGenerator implements TokenContextGenerator {
+
+ private final Set<String> inducedAbbreviations;
+
+ /**
+ * Creates a default context generator for tokenizer.
+ */
+ public DefaultTokenContextGenerator() {
+ this(Collections.<String>emptySet());
+ }
+
+ /**
+ * Creates a default context generator for tokenizer.
+ *
+ * @param inducedAbbreviations the induced abbreviations
+ */
+ public DefaultTokenContextGenerator(Set<String> inducedAbbreviations) {
+ this.inducedAbbreviations = inducedAbbreviations;
+ }
/* (non-Javadoc)
* @see opennlp.tools.tokenize.TokenContextGenerator#getContext(java.lang.String, int)
*/
public String[] getContext(String sentence, int index) {
List<String> preds = new ArrayList<String>();
- preds.add("p=" + sentence.substring(0, index));
- preds.add("s=" + sentence.substring(index));
+ String prefix = sentence.substring(0, index);
+ String suffix = sentence.substring(index);
+ preds.add("p=" + prefix);
+ preds.add("s=" + suffix);
if (index > 0) {
addCharPreds("p1", sentence.charAt(index - 1), preds);
if (index > 1) {
@@ -60,6 +82,14 @@ public class DefaultTokenContextGenerato
if (sentence.charAt(0) == '&' && sentence.charAt(sentence.length() - 1) == ';') {
preds.add("cc");//character code
}
+
+ if(index == sentence.length() - 1 && inducedAbbreviations.contains(sentence)) {
+ preds.add("pabb");
+ }
+
+ if(inducedAbbreviations.contains(sentence)) {
+ preds.add("abb");
+ }
String[] context = new String[preds.size()];
preds.toArray(context);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokSpanEventStream.java Fri Jul 22 17:19:02 2011
@@ -23,8 +23,10 @@ import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
+import java.util.regex.Pattern;
import opennlp.model.Event;
+import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.AbstractEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
@@ -41,6 +43,23 @@ public class TokSpanEventStream extends
private TokenContextGenerator cg;
private boolean skipAlphaNumerics;
+
+ private final Pattern alphaNumeric;
+
+ /**
+ * Initializes the current instance.
+ *
+ * @param tokenSamples
+ * @param skipAlphaNumerics
+ * @param cg
+ */
+ public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
+ boolean skipAlphaNumerics, Pattern alphaNumeric, TokenContextGenerator cg) {
+ super(tokenSamples);
+ this.alphaNumeric = alphaNumeric;
+ this.skipAlphaNumerics = skipAlphaNumerics;
+ this.cg = cg;
+ }
/**
* Initializes the current instance.
@@ -52,7 +71,8 @@ public class TokSpanEventStream extends
public TokSpanEventStream(ObjectStream<TokenSample> tokenSamples,
boolean skipAlphaNumerics, TokenContextGenerator cg) {
super(tokenSamples);
-
+ Factory factory = new Factory();
+ this.alphaNumeric = factory.getAlphanumeric(null);
this.skipAlphaNumerics = skipAlphaNumerics;
this.cg = cg;
}
@@ -99,7 +119,7 @@ public class TokSpanEventStream extends
cSpan = new Span(cSpan.getStart() + start, cSpan.getEnd() + start);
//should we skip this token
if (ctok.length() > 1
- && (!skipAlphaNumerics || !TokenizerME.alphaNumeric.matcher(ctok).matches())) {
+ && (!skipAlphaNumerics || !alphaNumeric.matcher(ctok).matches())) {
//find offsets of annotated tokens inside of candidate tokens
boolean foundTrainingTokens = false;
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java Fri Jul 22 17:19:02 2011
@@ -19,6 +19,7 @@ package opennlp.tools.tokenize;
import java.io.IOException;
+import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
@@ -31,32 +32,31 @@ public class TokenizerCrossValidator {
private final TrainingParameters params;
- private final int cutoff;
- private final int iterations;
+ private final Dictionary abbreviations;
private FMeasure fmeasure = new FMeasure();
public TokenizerCrossValidator(String language, boolean alphaNumericOptimization, int cutoff, int iterations) {
- this.language = language;
- this.alphaNumericOptimization = alphaNumericOptimization;
- this.cutoff = cutoff;
- this.iterations = iterations;
-
- params = null;
+ this(language, alphaNumericOptimization, createTrainingParameters(iterations, cutoff));
}
public TokenizerCrossValidator(String language, boolean alphaNumericOptimization) {
- this(language, alphaNumericOptimization, 5, 100);
+ this(language, alphaNumericOptimization, createTrainingParameters(100, 5));
}
public TokenizerCrossValidator(String language, boolean alphaNumericOptimization, TrainingParameters params) {
+ this(language, null, alphaNumericOptimization, params);
+ }
+
+ public TokenizerCrossValidator(String language, Dictionary abbreviations,
+ boolean alphaNumericOptimization, TrainingParameters params) {
+
this.language = language;
this.alphaNumericOptimization = alphaNumericOptimization;
- this.cutoff = -1;
- this.iterations = -1;
-
+ this.abbreviations = abbreviations;
this.params = params;
+
}
@@ -101,14 +101,8 @@ public class TokenizerCrossValidator {
// Maybe throws IOException if temporary file handling fails ...
TokenizerModel model;
- if (params == null) {
- model = TokenizerME.train(language, trainingSampleStream,
- alphaNumericOptimization, cutoff, iterations);
- }
- else {
- model = TokenizerME.train(language, trainingSampleStream,
- alphaNumericOptimization, params);
- }
+ model = TokenizerME.train(language, trainingSampleStream, abbreviations,
+ alphaNumericOptimization, params);
TokenizerEvaluator evaluator = new TokenizerEvaluator(new TokenizerME(model), printErrors);
evaluator.evaluate(trainingSampleStream.getTestSampleStream());
@@ -119,4 +113,14 @@ public class TokenizerCrossValidator {
public FMeasure getFMeasure() {
return fmeasure;
}
+
+ //TODO: this could go to a common util method, maybe inside TrainingParameters class
+ static TrainingParameters createTrainingParameters(int iterations, int cutoff) {
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM,
+ Integer.toString(iterations));
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+ return mlParams;
+ }
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java Fri Jul 22 17:19:02 2011
@@ -20,15 +20,19 @@ package opennlp.tools.tokenize;
import java.io.IOException;
import java.io.ObjectStreamException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.regex.Pattern;
import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
import opennlp.model.TrainUtil;
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
@@ -84,8 +88,11 @@ public class TokenizerME extends Abstrac
/**
* Alpha-Numeric Pattern
+ * @deprecated As of release 1.5.2, replaced by {@link Factory#getAlphanumeric(String)}
*/
- public static final Pattern alphaNumeric = Pattern.compile("^[A-Za-z0-9]+$");
+ public static final Pattern alphaNumeric = Pattern.compile(Factory.DEFAULT_ALPHANUMERIC);
+
+ private final Pattern alphanumeric;
/**
* The maximum entropy model to use to evaluate contexts.
@@ -95,7 +102,7 @@ public class TokenizerME extends Abstrac
/**
* The context generator.
*/
- private final TokenContextGenerator cg = new DefaultTokenContextGenerator();
+ private final TokenContextGenerator cg;
/**
* Optimization flag to skip alpha numeric tokens for further
@@ -112,12 +119,29 @@ public class TokenizerME extends Abstrac
private List<Span> newTokens;
public TokenizerME(TokenizerModel model) {
+ this(model, new Factory());
+ }
+
+ public TokenizerME(TokenizerModel model, Factory factory) {
+ String languageCode = model.getLanguage();
+
+ this.alphanumeric = factory.getAlphanumeric(languageCode);
+ this.cg = factory.createTokenContextGenerator(languageCode,
+ getAbbreviations(model.getAbbreviations()));
+
this.model = model.getMaxentModel();
useAlphaNumericOptimization = model.useAlphaNumericOptimization();
newTokens = new ArrayList<Span>();
tokProbs = new ArrayList<Double>(50);
}
+
+ private static Set<String> getAbbreviations(Dictionary abbreviations) {
+ if(abbreviations == null) {
+ return Collections.<String>emptySet();
+ }
+ return abbreviations.asStringSet();
+ }
/**
* Returns the probabilities associated with the most recent
@@ -154,7 +178,7 @@ public class TokenizerME extends Abstrac
newTokens.add(s);
tokProbs.add(1d);
}
- else if (useAlphaNumericOptimization() && alphaNumeric.matcher(tok).matches()) {
+ else if (useAlphaNumericOptimization() && alphanumeric.matcher(tok).matches()) {
newTokens.add(s);
tokProbs.add(1d);
}
@@ -185,17 +209,60 @@ public class TokenizerME extends Abstrac
return spans;
}
+ /**
+ * Trains a model for the {@link TokenizerME}.
+ *
+ * @param languageCode the language of the natural text
+ * @param samples the samples used for the training.
+ * @param useAlphaNumericOptimization - if true alpha numerics are skipped
+ * @param mlParams the machine learning train parameters
+ *
+ * @return the trained {@link TokenizerModel}
+ *
+ * @throws IOException it throws an {@link IOException} if an {@link IOException}
+ * is thrown during IO operations on a temp file which is created during training.
+ * Or if reading from the {@link ObjectStream} fails.
+ *
+ */
public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
boolean useAlphaNumericOptimization, TrainingParameters mlParams) throws IOException {
+ return train(languageCode, samples, null, useAlphaNumericOptimization,
+ mlParams);
+ }
+
+ /**
+ * Trains a model for the {@link TokenizerME}.
+ *
+ * @param languageCode the language of the natural text
+ * @param samples the samples used for the training.
+ * @param abbreviations an abbreviations dictionary
+ * @param useAlphaNumericOptimization - if true alpha numerics are skipped
+ * @param mlParams the machine learning train parameters
+ *
+ * @return the trained {@link TokenizerModel}
+ *
+ * @throws IOException it throws an {@link IOException} if an {@link IOException}
+ * is thrown during IO operations on a temp file which is created during training.
+ * Or if reading from the {@link ObjectStream} fails.
+ *
+ */
+ public static TokenizerModel train(String languageCode,
+ ObjectStream<TokenSample> samples, Dictionary abbreviations,
+ boolean useAlphaNumericOptimization, TrainingParameters mlParams)
+ throws IOException {
+ Factory factory = new Factory();
Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-
+
EventStream eventStream = new TokSpanEventStream(samples,
- useAlphaNumericOptimization);
+ useAlphaNumericOptimization, factory.getAlphanumeric(languageCode),
+ factory.createTokenContextGenerator(languageCode,
+ getAbbreviations(abbreviations)));
- AbstractModel maxentModel = TrainUtil.train(eventStream, mlParams.getSettings(), manifestInfoEntries);
-
- return new TokenizerModel(languageCode, maxentModel,
+ AbstractModel maxentModel = TrainUtil.train(eventStream,
+ mlParams.getSettings(), manifestInfoEntries);
+
+ return new TokenizerModel(languageCode, maxentModel, abbreviations,
useAlphaNumericOptimization, manifestInfoEntries);
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java?rev=1149660&r1=1149659&r2=1149660&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java Fri Jul 22 17:19:02 2011
@@ -29,6 +29,7 @@ import java.util.Map;
import opennlp.maxent.io.BinaryGISModelReader;
import opennlp.model.AbstractModel;
import opennlp.model.MaxentModel;
+import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;
@@ -44,6 +45,7 @@ public final class TokenizerModel extend
private static final String COMPONENT_NAME = "TokenizerME";
private static final String TOKENIZER_MODEL_ENTRY = "token.model";
+ private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
private static final String USE_ALPHA_NUMERIC_OPTIMIZATION =
"useAlphaNumericOptimization";
@@ -55,24 +57,44 @@ public final class TokenizerModel extend
* @param useAlphaNumericOptimization
*/
public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
- boolean useAlphaNumericOptimization, Map<String, String> manifestInfoEntries) {
+ Dictionary abbreviations, boolean useAlphaNumericOptimization,
+ Map<String, String> manifestInfoEntries) {
super(COMPONENT_NAME, language, manifestInfoEntries);
if (tokenizerMaxentModel == null)
- throw new IllegalArgumentException("tokenizerMaxentModel param must not bet null!");
+ throw new IllegalArgumentException(
+ "tokenizerMaxentModel param must not bet null!");
if (!isModelCompatible(tokenizerMaxentModel))
- throw new IllegalArgumentException("The maxent model is not compatible!");
+ throw new IllegalArgumentException("The maxent model is not compatible!");
artifactMap.put(TOKENIZER_MODEL_ENTRY, tokenizerMaxentModel);
setManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION,
Boolean.toString(useAlphaNumericOptimization));
+
+ // Abbreviations are optional
+ if (abbreviations != null)
+ artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviations);
+ }
+
+ /**
+ * Initializes the current instance.
+ *
+ * @param language
+ * @param tokenizerMaxentModel
+ * @param useAlphaNumericOptimization
+ * @param manifestInfoEntries
+ */
+ public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
+ boolean useAlphaNumericOptimization, Map<String, String> manifestInfoEntries) {
+ this(language, tokenizerMaxentModel, null, useAlphaNumericOptimization, manifestInfoEntries);
}
/**
* Initializes the current instance.
*
+ * @param language
* @param tokenizerMaxentModel
* @param useAlphaNumericOptimization
*/
@@ -119,11 +141,21 @@ public final class TokenizerModel extend
throw new InvalidFormatException("The " + USE_ALPHA_NUMERIC_OPTIMIZATION + " parameter " +
"cannot be found!");
}
+
+ Object abbreviationsEntry = artifactMap.get(ABBREVIATIONS_ENTRY_NAME);
+
+ if (abbreviationsEntry != null && !(abbreviationsEntry instanceof Dictionary)) {
+ throw new InvalidFormatException("Abbreviations dictionary has wrong type!");
+ }
}
public AbstractModel getMaxentModel() {
return (AbstractModel) artifactMap.get(TOKENIZER_MODEL_ENTRY);
}
+
+ public Dictionary getAbbreviations() {
+ return (Dictionary) artifactMap.get(ABBREVIATIONS_ENTRY_NAME);
+ }
public boolean useAlphaNumericOptimization() {
String optimization = getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION);
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java?rev=1149660&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java Fri Jul 22 17:19:02 2011
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize.lang;
+
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import opennlp.tools.tokenize.DefaultTokenContextGenerator;
+import opennlp.tools.tokenize.TokenContextGenerator;
+
+public class Factory {
+
+ public static final String DEFAULT_ALPHANUMERIC = "^[A-Za-z0-9]+$";
+
+ /**
+ * Gets the alpha numeric pattern for the language. Please save the value
+ * locally because this call is expensive.
+ *
+ * @param languageCode
+ * the language code. If null or unknown the default pattern will be
+ * returned.
+ * @return the alpha numeric pattern for the language or the default pattern.
+ */
+ public Pattern getAlphanumeric(String languageCode) {
+ if("pt".equals(languageCode)) {
+ return Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
+ }
+
+ return Pattern.compile(DEFAULT_ALPHANUMERIC);
+ }
+
+ public TokenContextGenerator createTokenContextGenerator(String languageCode, Set<String> abbreviations) {
+ return new DefaultTokenContextGenerator(abbreviations);
+ }
+
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain