You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/09/11 16:02:06 UTC
svn commit: r1383421 - in /opennlp/trunk/opennlp-uima:
descriptors/SentenceDetectorTrainer.xml
src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java
Author: joern
Date: Tue Sep 11 14:02:05 2012
New Revision: 1383421
URL: http://svn.apache.org/viewvc?rev=1383421&view=rev
Log:
OPENNLP-536 Added sample trace file support to the sentence detector trainer.
Modified:
opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml
opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java
Modified: opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml?rev=1383421&r1=1383420&r2=1383421&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml (original)
+++ opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml Tue Sep 11 14:02:05 2012
@@ -57,6 +57,18 @@
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
+ <configurationParameter>
+ <name>opennlp.uima.SampleTraceFile</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>opennlp.uima.SampleTraceFileEncoding</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
</configurationParameters>
<configurationParameterSettings>
Modified: opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java?rev=1383421&r1=1383420&r2=1383421&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java (original)
+++ opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java Tue Sep 11 14:02:05 2012
@@ -18,7 +18,10 @@
package opennlp.uima.sentdetect;
import java.io.File;
+import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
@@ -27,12 +30,14 @@ import opennlp.tools.sentdetect.Sentence
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;
import opennlp.uima.util.CasConsumerUtil;
import opennlp.uima.util.OpennlpUtil;
+import opennlp.uima.util.SampleTraceStream;
import opennlp.uima.util.UimaUtil;
import org.apache.uima.UimaContext;
@@ -74,6 +79,10 @@ public final class SentenceDetectorTrain
private UimaContext mContext;
private String eosChars;
+
+ private File sampleTraceFile;
+
+ private String sampleTraceFileEncoding;
/**
* Initializes the current instance.
@@ -98,6 +107,17 @@ public final class SentenceDetectorTrain
UimaUtil.LANGUAGE_PARAMETER);
eosChars = CasConsumerUtil.getOptionalStringParameter(mContext, "opennlp.uima.EOSChars");
+
+
+ String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter(
+ getUimaContext(), "opennlp.uima.SampleTraceFile");
+
+ if (sampleTraceFileName != null) {
+ sampleTraceFile = new File(getUimaContextAdmin().getResourceManager()
+ .getDataPath() + File.separatorChar + sampleTraceFileName);
+ sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter(
+ getUimaContext(), "opennlp.uima.SampleTraceFileEncoding");
+ }
}
/**
@@ -127,7 +147,8 @@ public final class SentenceDetectorTrain
sentSpans[i++] = new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd());
}
- sentenceSamples.add(new SentenceSample(cas.getDocumentText(), sentSpans));
+ // TODO: The line cleaning should be done more carefully
+ sentenceSamples.add(new SentenceSample(cas.getDocumentText().replace('\n', ' '), sentSpans));
}
/**
@@ -148,7 +169,16 @@ public final class SentenceDetectorTrain
TrainingParameters mlParams = ModelUtil.createTrainingParameters(100, 5);
- SentenceModel sentenceModel = SentenceDetectorME.train(language, ObjectStreamUtils.createObjectStream(sentenceSamples),
+ ObjectStream<SentenceSample> samples = ObjectStreamUtils.createObjectStream(sentenceSamples);
+
+ Writer samplesOut = null;
+
+ if (sampleTraceFile != null) {
+ samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding);
+ samples = new SampleTraceStream<SentenceSample>(samples, samplesOut);
+ }
+
+ SentenceModel sentenceModel = SentenceDetectorME.train(language, samples,
sdFactory, mlParams);
// dereference to allow garbage collection