You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/07/21 22:32:19 UTC

svn commit: r1149347 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/sentdetect/ sentdetect/ sentdetect/lang/

Author: colen
Date: Thu Jul 21 20:32:17 2011
New Revision: 1149347

URL: http://svn.apache.org/viewvc?rev=1149347&view=rev
Log:
OPENNLP-225 Restored abbreviation dictionary in Sentence Detector using the current implementation of Dictionary.

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java Thu Jul 21 20:32:17 2011
@@ -27,6 +27,7 @@ import opennlp.tools.cmdline.CVParams;
 import opennlp.tools.cmdline.CmdLineTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.sentdetect.SDCrossValidator;
 import opennlp.tools.sentdetect.SentenceSample;
 import opennlp.tools.util.ObjectStream;
@@ -72,14 +73,17 @@ public final class SentenceDetectorCross
     
     SDCrossValidator validator;
 
-    if (mlParams == null) {
-      validator = new SDCrossValidator(params.getLang(), params.getCutoff(), params.getIterations());
-    }
-    else {
-      validator = new SDCrossValidator(params.getLang(), mlParams);
-    }
-    
     try {
+      Dictionary abbreviations = SentenceDetectorTrainerTool.loadDict(
+          params.getAbbDict(), params.getIsAbbDictCS());
+      if (mlParams == null) {
+        validator = new SDCrossValidator(params.getLang(), params.getCutoff(),
+            params.getIterations(), abbreviations);
+      } else {
+        validator = new SDCrossValidator(params.getLang(), mlParams,
+            abbreviations);
+      }
+      
       validator.evaluate(sampleStream, params.getFolds(), params.getMisclassified());
     }
     catch (IOException e) {

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java Thu Jul 21 20:32:17 2011
@@ -29,6 +29,7 @@ import opennlp.tools.cmdline.CmdLineTool
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
 import opennlp.tools.cmdline.TrainingToolParams;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.sentdetect.SentenceSample;
@@ -67,6 +68,15 @@ public final class SentenceDetectorTrain
     return new SentenceSampleStream(lineStream);
   }
   
+  static Dictionary loadDict(File f, boolean caseSensitive) throws IOException {
+    Dictionary dict = null;
+    if (f != null) {
+      CmdLineUtil.checkInputFile("abb dict", f);
+      dict = new Dictionary(new FileInputStream(f), caseSensitive);
+    }
+    return dict;
+  }
+  
   public void run(String[] args) {
     if (!ArgumentParser.validateArguments(args, TrainerToolParams.class)) {
       System.err.println(getHelp());
@@ -96,12 +106,13 @@ public final class SentenceDetectorTrain
 
     SentenceModel model;
     try {
+      Dictionary dict = loadDict(params.getAbbDict(), params.getIsAbbDictCS());
       if (mlParams == null) {
-        model = SentenceDetectorME.train(params.getLang(), sampleStream, true, null, 
+        model = SentenceDetectorME.train(params.getLang(), sampleStream, true, dict, 
             params.getCutoff(), params.getIterations());
       }
       else {
-        model = SentenceDetectorME.train(params.getLang(), sampleStream, true, null, 
+        model = SentenceDetectorME.train(params.getLang(), sampleStream, true, dict, 
             mlParams);
       }
     } catch (IOException e) {

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java Thu Jul 21 20:32:17 2011
@@ -17,6 +17,10 @@
 
 package opennlp.tools.cmdline.sentdetect;
 
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.BasicTrainingParams;
 
 /**
@@ -25,6 +29,13 @@ import opennlp.tools.cmdline.BasicTraini
  * Note: Do not use this class, internal use only!
  */
 interface TrainingParams extends BasicTrainingParams {
-  
-  
+
+  @ParameterDescription(valueName = "path", description = "The abbreviation dictionary in XML format.")
+  @OptionalParameter
+  File getAbbDict();
+
+  @ParameterDescription(valueName = "true|false", description = "True if the abbreviation dictionary is case sensitive. Default is true.")
+  @OptionalParameter(defaultValue = "true")
+  Boolean getIsAbbDictCS();
+
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java Thu Jul 21 20:32:17 2011
@@ -19,6 +19,7 @@ package opennlp.tools.sentdetect;
 
 import java.io.IOException;
 
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.eval.CrossValidationPartitioner;
@@ -31,27 +32,37 @@ public class SDCrossValidator {
   
   private final String languageCode;
   
-  private final int cutoff;
-  private final int iterations;
+  private final Dictionary abbreviations;
   
   private final TrainingParameters params;
   
   private FMeasure fmeasure = new FMeasure();
   
   public SDCrossValidator(String languageCode, int cutoff, int iterations) {
-    
-    this.languageCode = languageCode;
-    this.cutoff = cutoff;
-    this.iterations = iterations;
-    
-    params = null;
+    this(languageCode, createParams(cutoff, iterations));
   }
   
   public SDCrossValidator(String languageCode, TrainingParameters params) {
+    this(languageCode, params, null);
+  }
+  
+  public SDCrossValidator(String languageCode, int cutoff, int iterations, Dictionary abbreviations) {
+    this(languageCode, createParams(cutoff, iterations), abbreviations);
+  }
+  
+  public SDCrossValidator(String languageCode, TrainingParameters params, Dictionary abbreviations) {
     this.languageCode = languageCode;
     this.params = params;
-    cutoff = -1;
-    iterations = -1;
+    this.abbreviations = abbreviations;
+  }
+  
+  private static TrainingParameters createParams(int cutoff, int iterations) {
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM,
+        Integer.toString(iterations));
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+    return mlParams;
   }
   
   public SDCrossValidator(String languageCode) {
@@ -98,12 +109,8 @@ public class SDCrossValidator {
      
       SentenceModel model; 
       
-      if (params == null) {
-        model = SentenceDetectorME.train(languageCode, trainingSampleStream, true, null, cutoff, iterations);
-      }
-      else {
-        model = SentenceDetectorME.train(languageCode, trainingSampleStream, true, null, params);
-      }
+      model = SentenceDetectorME.train(languageCode, trainingSampleStream,
+          true, abbreviations, params);
       
       // do testing
       SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java Thu Jul 21 20:32:17 2011
@@ -20,9 +20,11 @@ package opennlp.tools.sentdetect;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
@@ -88,11 +90,18 @@ public class SentenceDetectorME implemen
 
   public SentenceDetectorME(SentenceModel model, Factory factory) {
     this.model = model.getMaxentModel();
-    cgen = factory.createSentenceContextGenerator(model.getLanguage());
+    cgen = factory.createSentenceContextGenerator(model.getLanguage(), getAbbreviations(model.getAbbreviations()));
     scanner = factory.createEndOfSentenceScanner(model.getLanguage());
     useTokenEnd = model.useTokenEnd();
   }
 
+  private static Set<String> getAbbreviations(Dictionary abbreviations) {
+    if(abbreviations == null) {
+      return Collections.<String>emptySet();
+    }
+    return abbreviations.asStringSet();
+  }
+
   /**
    * Detect sentences in a String.
    *
@@ -266,7 +275,7 @@ public class SentenceDetectorME implemen
     
     // TODO: Fix the EventStream to throw exceptions when training goes wrong
     EventStream eventStream = new SDEventStream(samples,
-        factory.createSentenceContextGenerator(languageCode),
+        factory.createSentenceContextGenerator(languageCode, getAbbreviations(abbreviations)),
         factory.createEndOfSentenceScanner(languageCode));
     
     AbstractModel sentModel = TrainUtil.train(eventStream, mlParams.getSettings(), manifestInfoEntries);

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java?rev=1149347&r1=1149346&r2=1149347&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java Thu Jul 21 20:32:17 2011
@@ -18,6 +18,9 @@
 
 package opennlp.tools.sentdetect.lang;
 
+import java.util.Collections;
+import java.util.Set;
+
 import opennlp.tools.sentdetect.DefaultEndOfSentenceScanner;
 import opennlp.tools.sentdetect.DefaultSDContextGenerator;
 import opennlp.tools.sentdetect.EndOfSentenceScanner;
@@ -34,12 +37,21 @@ public class Factory {
     return new DefaultEndOfSentenceScanner(new char[]{'.', '!', '?'});
   }
 
+  public SDContextGenerator createSentenceContextGenerator(String languageCode, Set<String> abbreviations) {
+
+    if ("th".equals(languageCode)) {
+      return new SentenceContextGenerator();
+    }
+
+    return new DefaultSDContextGenerator(abbreviations, new char[]{'.', '!', '?'});
+  }
+  
   public SDContextGenerator createSentenceContextGenerator(String languageCode) {
 
     if ("th".equals(languageCode)) {
       return new SentenceContextGenerator();
     }
 
-    return new DefaultSDContextGenerator(new char[]{'.', '!', '?'});
+    return new DefaultSDContextGenerator(Collections.<String>emptySet(), new char[]{'.', '!', '?'});
   }
 }
\ No newline at end of file