You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/07/21 22:32:19 UTC
svn commit: r1149347 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/sentdetect/ sentdetect/ sentdetect/lang/
Author: colen
Date: Thu Jul 21 20:32:17 2011
New Revision: 1149347
URL: http://svn.apache.org/viewvc?rev=1149347&view=rev
Log:
OPENNLP-225 Restored abbreviation dictionary in Sentence Detector using the current implementation of Dictionary.
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java Thu Jul 21 20:32:17 2011
@@ -27,6 +27,7 @@ import opennlp.tools.cmdline.CVParams;
import opennlp.tools.cmdline.CmdLineTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.sentdetect.SDCrossValidator;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStream;
@@ -72,14 +73,17 @@ public final class SentenceDetectorCross
SDCrossValidator validator;
- if (mlParams == null) {
- validator = new SDCrossValidator(params.getLang(), params.getCutoff(), params.getIterations());
- }
- else {
- validator = new SDCrossValidator(params.getLang(), mlParams);
- }
-
try {
+ Dictionary abbreviations = SentenceDetectorTrainerTool.loadDict(
+ params.getAbbDict(), params.getIsAbbDictCS());
+ if (mlParams == null) {
+ validator = new SDCrossValidator(params.getLang(), params.getCutoff(),
+ params.getIterations(), abbreviations);
+ } else {
+ validator = new SDCrossValidator(params.getLang(), mlParams,
+ abbreviations);
+ }
+
validator.evaluate(sampleStream, params.getFolds(), params.getMisclassified());
}
catch (IOException e) {
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java Thu Jul 21 20:32:17 2011
@@ -29,6 +29,7 @@ import opennlp.tools.cmdline.CmdLineTool
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.TrainingToolParams;
+import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
@@ -67,6 +68,15 @@ public final class SentenceDetectorTrain
return new SentenceSampleStream(lineStream);
}
+ static Dictionary loadDict(File f, boolean caseSensitive) throws IOException {
+ Dictionary dict = null;
+ if (f != null) {
+ CmdLineUtil.checkInputFile("abb dict", f);
+ dict = new Dictionary(new FileInputStream(f), caseSensitive);
+ }
+ return dict;
+ }
+
public void run(String[] args) {
if (!ArgumentParser.validateArguments(args, TrainerToolParams.class)) {
System.err.println(getHelp());
@@ -96,12 +106,13 @@ public final class SentenceDetectorTrain
SentenceModel model;
try {
+ Dictionary dict = loadDict(params.getAbbDict(), params.getIsAbbDictCS());
if (mlParams == null) {
- model = SentenceDetectorME.train(params.getLang(), sampleStream, true, null,
+ model = SentenceDetectorME.train(params.getLang(), sampleStream, true, dict,
params.getCutoff(), params.getIterations());
}
else {
- model = SentenceDetectorME.train(params.getLang(), sampleStream, true, null,
+ model = SentenceDetectorME.train(params.getLang(), sampleStream, true, dict,
mlParams);
}
} catch (IOException e) {
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java Thu Jul 21 20:32:17 2011
@@ -17,6 +17,10 @@
package opennlp.tools.cmdline.sentdetect;
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.BasicTrainingParams;
/**
@@ -25,6 +29,13 @@ import opennlp.tools.cmdline.BasicTraini
* Note: Do not use this class, internal use only!
*/
interface TrainingParams extends BasicTrainingParams {
-
-
+
+ @ParameterDescription(valueName = "path", description = "The abbreviation dictionary in XML format.")
+ @OptionalParameter
+ File getAbbDict();
+
+ @ParameterDescription(valueName = "true|false", description = "True if the abbreviation dictionary is case sensitive. Default is true.")
+ @OptionalParameter(defaultValue = "true")
+ Boolean getIsAbbDictCS();
+
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java Thu Jul 21 20:32:17 2011
@@ -19,6 +19,7 @@ package opennlp.tools.sentdetect;
import java.io.IOException;
+import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
@@ -31,27 +32,37 @@ public class SDCrossValidator {
private final String languageCode;
- private final int cutoff;
- private final int iterations;
+ private final Dictionary abbreviations;
private final TrainingParameters params;
private FMeasure fmeasure = new FMeasure();
public SDCrossValidator(String languageCode, int cutoff, int iterations) {
-
- this.languageCode = languageCode;
- this.cutoff = cutoff;
- this.iterations = iterations;
-
- params = null;
+ this(languageCode, createParams(cutoff, iterations));
}
public SDCrossValidator(String languageCode, TrainingParameters params) {
+ this(languageCode, params, null);
+ }
+
+ public SDCrossValidator(String languageCode, int cutoff, int iterations, Dictionary abbreviations) {
+ this(languageCode, createParams(cutoff, iterations), abbreviations);
+ }
+
+ public SDCrossValidator(String languageCode, TrainingParameters params, Dictionary abbreviations) {
this.languageCode = languageCode;
this.params = params;
- cutoff = -1;
- iterations = -1;
+ this.abbreviations = abbreviations;
+ }
+
+ private static TrainingParameters createParams(int cutoff, int iterations) {
+ TrainingParameters mlParams = new TrainingParameters();
+ mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+ mlParams.put(TrainingParameters.ITERATIONS_PARAM,
+ Integer.toString(iterations));
+ mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+ return mlParams;
}
public SDCrossValidator(String languageCode) {
@@ -98,12 +109,8 @@ public class SDCrossValidator {
SentenceModel model;
- if (params == null) {
- model = SentenceDetectorME.train(languageCode, trainingSampleStream, true, null, cutoff, iterations);
- }
- else {
- model = SentenceDetectorME.train(languageCode, trainingSampleStream, true, null, params);
- }
+ model = SentenceDetectorME.train(languageCode, trainingSampleStream,
+ true, abbreviations, params);
// do testing
SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java Thu Jul 21 20:32:17 2011
@@ -20,9 +20,11 @@ package opennlp.tools.sentdetect;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
@@ -88,11 +90,18 @@ public class SentenceDetectorME implemen
public SentenceDetectorME(SentenceModel model, Factory factory) {
this.model = model.getMaxentModel();
- cgen = factory.createSentenceContextGenerator(model.getLanguage());
+ cgen = factory.createSentenceContextGenerator(model.getLanguage(), getAbbreviations(model.getAbbreviations()));
scanner = factory.createEndOfSentenceScanner(model.getLanguage());
useTokenEnd = model.useTokenEnd();
}
+ private static Set<String> getAbbreviations(Dictionary abbreviations) {
+ if(abbreviations == null) {
+ return Collections.<String>emptySet();
+ }
+ return abbreviations.asStringSet();
+ }
+
/**
* Detect sentences in a String.
*
@@ -266,7 +275,7 @@ public class SentenceDetectorME implemen
// TODO: Fix the EventStream to throw exceptions when training goes wrong
EventStream eventStream = new SDEventStream(samples,
- factory.createSentenceContextGenerator(languageCode),
+ factory.createSentenceContextGenerator(languageCode, getAbbreviations(abbreviations)),
factory.createEndOfSentenceScanner(languageCode));
AbstractModel sentModel = TrainUtil.train(eventStream, mlParams.getSettings(), manifestInfoEntries);
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java Thu Jul 21 20:32:17 2011
@@ -18,6 +18,9 @@
package opennlp.tools.sentdetect.lang;
+import java.util.Collections;
+import java.util.Set;
+
import opennlp.tools.sentdetect.DefaultEndOfSentenceScanner;
import opennlp.tools.sentdetect.DefaultSDContextGenerator;
import opennlp.tools.sentdetect.EndOfSentenceScanner;
@@ -34,12 +37,21 @@ public class Factory {
return new DefaultEndOfSentenceScanner(new char[]{'.', '!', '?'});
}
+ public SDContextGenerator createSentenceContextGenerator(String languageCode, Set<String> abbreviations) {
+
+ if ("th".equals(languageCode)) {
+ return new SentenceContextGenerator();
+ }
+
+ return new DefaultSDContextGenerator(abbreviations, new char[]{'.', '!', '?'});
+ }
+
public SDContextGenerator createSentenceContextGenerator(String languageCode) {
if ("th".equals(languageCode)) {
return new SentenceContextGenerator();
}
- return new DefaultSDContextGenerator(new char[]{'.', '!', '?'});
+ return new DefaultSDContextGenerator(Collections.<String>emptySet(), new char[]{'.', '!', '?'});
}
}
\ No newline at end of file