You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/07/10 09:37:05 UTC
svn commit: r1359507 -
/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java
Author: joern
Date: Tue Jul 10 07:37:05 2012
New Revision: 1359507
URL: http://svn.apache.org/viewvc?rev=1359507&view=rev
Log:
OPENNLP-517 Added end-of-sentence character configuration.
Modified:
opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java
Modified: opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java?rev=1359507&r1=1359506&r2=1359507&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java (original)
+++ opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java Tue Jul 10 07:37:05 2012
@@ -20,15 +20,17 @@ package opennlp.uima.sentdetect;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import opennlp.maxent.GIS;
+import opennlp.tools.sentdetect.SentenceDetectorFactory;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.Span;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelUtil;
import opennlp.uima.util.CasConsumerUtil;
import opennlp.uima.util.OpennlpUtil;
import opennlp.uima.util.UimaUtil;
@@ -54,6 +56,7 @@ import org.apache.uima.util.ProcessTrace
* <tr><th>Type</th> <th>Name</th> <th>Description</th></tr>
* <tr><td>String</td> <td>opennlp.uima.ModelName</td> <td>The name of the model file</td></tr>
* <tr><td>String</td> <td>opennlp.uima.SentenceType</td> <td>The full name of the sentence type</td></tr>
+ * <tr><td>String</td> <td>opennlp.uima.EOSChars</td> <td>A string containing end-of-sentence characters</td></tr>
* </table>
*/
public final class SentenceDetectorTrainer extends CasConsumer_ImplBase {
@@ -70,6 +73,8 @@ public final class SentenceDetectorTrain
private UimaContext mContext;
+ private String eosChars;
+
/**
* Initializes the current instance.
*/
@@ -91,6 +96,8 @@ public final class SentenceDetectorTrain
language = CasConsumerUtil.getRequiredStringParameter(mContext,
UimaUtil.LANGUAGE_PARAMETER);
+
+ eosChars = CasConsumerUtil.getOptionalStringParameter(mContext, "opennlp.uima.EOSChars");
}
/**
@@ -130,9 +137,19 @@ public final class SentenceDetectorTrain
public void collectionProcessComplete(ProcessTrace trace)
throws ResourceProcessException, IOException {
GIS.PRINT_MESSAGES = false;
-
- SentenceModel sentenceModel = SentenceDetectorME.train(language,
- ObjectStreamUtils.createObjectStream(sentenceSamples), true, null);
+
+ char eos[] = null;
+ if (eosChars != null) {
+ eos = eosChars.toCharArray();
+ }
+
+ SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
+ null, language, true, null, eos);
+
+ TrainingParameters mlParams = ModelUtil.createTrainingParameters(100, 5);
+
+ SentenceModel sentenceModel = SentenceDetectorME.train(language, ObjectStreamUtils.createObjectStream(sentenceSamples),
+ sdFactory, mlParams);
// dereference to allow garbage collection
sentenceSamples = null;
@@ -157,4 +174,4 @@ public final class SentenceDetectorTrain
// dereference to allow garbage collection
sentenceSamples = null;
}
-}
\ No newline at end of file
+}