You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/07/10 09:37:05 UTC

svn commit: r1359507 - /opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java

Author: joern
Date: Tue Jul 10 07:37:05 2012
New Revision: 1359507

URL: http://svn.apache.org/viewvc?rev=1359507&view=rev
Log:
OPENNLP-517 Added end-of-sentence character configuration.

Modified:
    opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java

Modified: opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java?rev=1359507&r1=1359506&r2=1359507&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java (original)
+++ opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java Tue Jul 10 07:37:05 2012
@@ -20,15 +20,17 @@ package opennlp.uima.sentdetect;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Iterator;
 import java.util.List;
 
 import opennlp.maxent.GIS;
+import opennlp.tools.sentdetect.SentenceDetectorFactory;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.sentdetect.SentenceSample;
 import opennlp.tools.util.ObjectStreamUtils;
 import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelUtil;
 import opennlp.uima.util.CasConsumerUtil;
 import opennlp.uima.util.OpennlpUtil;
 import opennlp.uima.util.UimaUtil;
@@ -54,6 +56,7 @@ import org.apache.uima.util.ProcessTrace
  *   <tr><th>Type</th> <th>Name</th> <th>Description</th></tr>
  *   <tr><td>String</td> <td>opennlp.uima.ModelName</td> <td>The name of the model file</td></tr>
  *   <tr><td>String</td> <td>opennlp.uima.SentenceType</td> <td>The full name of the sentence type</td></tr>
+ *   <tr><td>String</td> <td>opennlp.uima.EOSChars</td> <td>A string containing end-of-sentence characters</td></tr>
  * </table>
  */
 public final class SentenceDetectorTrainer extends CasConsumer_ImplBase {
@@ -70,6 +73,8 @@ public final class SentenceDetectorTrain
 
   private UimaContext mContext;
   
+  private String eosChars;
+  
   /**
    * Initializes the current instance.
    */
@@ -91,6 +96,8 @@ public final class SentenceDetectorTrain
     
     language = CasConsumerUtil.getRequiredStringParameter(mContext,
         UimaUtil.LANGUAGE_PARAMETER);
+    
+    eosChars = CasConsumerUtil.getOptionalStringParameter(mContext, "opennlp.uima.EOSChars");
   }
   
   /**
@@ -130,9 +137,19 @@ public final class SentenceDetectorTrain
   public void collectionProcessComplete(ProcessTrace trace)
       throws ResourceProcessException, IOException {
     GIS.PRINT_MESSAGES = false;
-
-    SentenceModel sentenceModel = SentenceDetectorME.train(language,
-        ObjectStreamUtils.createObjectStream(sentenceSamples), true, null);
+    
+    char eos[] = null; 
+    if (eosChars != null) {
+      eos = eosChars.toCharArray();
+    }
+    
+    SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
+            null, language, true, null, eos);
+    
+    TrainingParameters mlParams = ModelUtil.createTrainingParameters(100, 5);
+    
+    SentenceModel sentenceModel = SentenceDetectorME.train(language, ObjectStreamUtils.createObjectStream(sentenceSamples),
+         sdFactory, mlParams);
     
     // dereference to allow garbage collection
     sentenceSamples = null;
@@ -157,4 +174,4 @@ public final class SentenceDetectorTrain
     // dereference to allow garbage collection
     sentenceSamples = null;
   }
-}
\ No newline at end of file
+}