You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/09/11 16:02:06 UTC

svn commit: r1383421 - in /opennlp/trunk/opennlp-uima: descriptors/SentenceDetectorTrainer.xml src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java

Author: joern
Date: Tue Sep 11 14:02:05 2012
New Revision: 1383421

URL: http://svn.apache.org/viewvc?rev=1383421&view=rev
Log:
OPENNLP-536 Added sample trace file support to the sentence detector trainer.

Modified:
    opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml
    opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java

Modified: opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml?rev=1383421&r1=1383420&r2=1383421&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml (original)
+++ opennlp/trunk/opennlp-uima/descriptors/SentenceDetectorTrainer.xml Tue Sep 11 14:02:05 2012
@@ -57,6 +57,18 @@
 				<multiValued>false</multiValued>
 				<mandatory>false</mandatory>
 			</configurationParameter>
+			<configurationParameter>
+				<name>opennlp.uima.SampleTraceFile</name>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>false</mandatory>
+			</configurationParameter>
+			<configurationParameter>
+				<name>opennlp.uima.SampleTraceFileEncoding</name>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>false</mandatory>
+			</configurationParameter>			
 		</configurationParameters>
 
 		<configurationParameterSettings>

Modified: opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java?rev=1383421&r1=1383420&r2=1383421&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java (original)
+++ opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/sentdetect/SentenceDetectorTrainer.java Tue Sep 11 14:02:05 2012
@@ -18,7 +18,10 @@
 package opennlp.uima.sentdetect;
 
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -27,12 +30,14 @@ import opennlp.tools.sentdetect.Sentence
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ObjectStreamUtils;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelUtil;
 import opennlp.uima.util.CasConsumerUtil;
 import opennlp.uima.util.OpennlpUtil;
+import opennlp.uima.util.SampleTraceStream;
 import opennlp.uima.util.UimaUtil;
 
 import org.apache.uima.UimaContext;
@@ -74,6 +79,10 @@ public final class SentenceDetectorTrain
   private UimaContext mContext;
   
   private String eosChars;
+
+  private File sampleTraceFile;
+
+  private String sampleTraceFileEncoding;
   
   /**
    * Initializes the current instance.
@@ -98,6 +107,17 @@ public final class SentenceDetectorTrain
         UimaUtil.LANGUAGE_PARAMETER);
     
     eosChars = CasConsumerUtil.getOptionalStringParameter(mContext, "opennlp.uima.EOSChars");
+    
+    
+    String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter(
+            getUimaContext(), "opennlp.uima.SampleTraceFile");
+        
+    if (sampleTraceFileName != null) {
+      sampleTraceFile = new File(getUimaContextAdmin().getResourceManager()
+          .getDataPath() + File.separatorChar + sampleTraceFileName);
+      sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter(
+          getUimaContext(), "opennlp.uima.SampleTraceFileEncoding");
+    }    
   }
   
   /**
@@ -127,7 +147,8 @@ public final class SentenceDetectorTrain
       sentSpans[i++] = new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd());
     }
 
-    sentenceSamples.add(new SentenceSample(cas.getDocumentText(), sentSpans));
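+    // TODO: The line cleaning should be done more carefully; only '\n' is replaced (one char for one char, so span offsets stay valid), other line breaks such as '\r' are left as-is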
+    sentenceSamples.add(new SentenceSample(cas.getDocumentText().replace('\n', ' '), sentSpans));
   }
 
   /**
@@ -148,7 +169,16 @@ public final class SentenceDetectorTrain
     
     TrainingParameters mlParams = ModelUtil.createTrainingParameters(100, 5);
     
-    SentenceModel sentenceModel = SentenceDetectorME.train(language, ObjectStreamUtils.createObjectStream(sentenceSamples),
+    ObjectStream<SentenceSample> samples = ObjectStreamUtils.createObjectStream(sentenceSamples);
+    
+    Writer samplesOut = null;
+    
+    if (sampleTraceFile != null) {
+        samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding);
+        samples = new SampleTraceStream<SentenceSample>(samples, samplesOut);
+    }
+    
+    SentenceModel sentenceModel = SentenceDetectorME.train(language, samples,
          sdFactory, mlParams);
     
     // dereference to allow garbage collection