You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/09/11 14:27:48 UTC

svn commit: r1383378 - in /opennlp/trunk/opennlp-uima: descriptors/TokenizerTrainer.xml src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java

Author: joern
Date: Tue Sep 11 12:27:48 2012
New Revision: 1383378

URL: http://svn.apache.org/viewvc?rev=1383378&view=rev
Log:
OPENNLP-535 Added sample trace file support to the tokenizer trainer.

Modified:
    opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml
    opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java

Modified: opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml?rev=1383378&r1=1383377&r2=1383378&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml (original)
+++ opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml Tue Sep 11 12:27:48 2012
@@ -61,6 +61,18 @@
 				<multiValued>false</multiValued>
 				<mandatory>true</mandatory>
 			</configurationParameter>
+			<configurationParameter>
+				<name>opennlp.uima.SampleTraceFile</name>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>false</mandatory>
+			</configurationParameter>
+			<configurationParameter>
+				<name>opennlp.uima.SampleTraceFileEncoding</name>
+				<type>String</type>
+				<multiValued>false</multiValued>
+				<mandatory>false</mandatory>
+			</configurationParameter>			
 		</configurationParameters>
 		<configurationParameterSettings>
 			<nameValuePair>

Modified: opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java?rev=1383378&r1=1383377&r2=1383378&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java (original)
+++ opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java Tue Sep 11 12:27:48 2012
@@ -19,9 +19,12 @@ package opennlp.uima.tokenize;
 
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -30,7 +33,6 @@ import java.util.List;
 
 import opennlp.maxent.GIS;
 import opennlp.tools.namefind.NameSample;
-import opennlp.tools.namefind.NameSampleDataStream;
 import opennlp.tools.tokenize.TokenSample;
 import opennlp.tools.tokenize.TokenSampleStream;
 import opennlp.tools.tokenize.TokenizerME;
@@ -42,6 +44,7 @@ import opennlp.tools.util.Span;
 import opennlp.uima.util.CasConsumerUtil;
 import opennlp.uima.util.ContainingConstraint;
 import opennlp.uima.util.OpennlpUtil;
+import opennlp.uima.util.SampleTraceStream;
 import opennlp.uima.util.UimaUtil;
 
 import org.apache.uima.UimaContext;
@@ -78,7 +81,7 @@ public final class TokenizerTrainer exte
   
   public static final String IS_ALPHA_NUMERIC_OPTIMIZATION = 
       "opennlp.uima.tokenizer.IsAlphaNumericOptimization";
-	  
+
   private List<TokenSample> tokenSamples = new ArrayList<TokenSample>();
 
   private UimaContext mContext;
@@ -90,14 +93,18 @@ public final class TokenizerTrainer exte
   private String mModelName;
 
   private String additionalTrainingDataFile;
-  
+
   private String additionalTrainingDataEncoding;
-  
+
   private String language;
-  
+
   private Boolean isSkipAlphaNumerics;
-  
+
   private Logger mLogger;
+
+  private String sampleTraceFileEncoding;
+
+  private File sampleTraceFile;
   
   /**
    * Initializes the current instance.
@@ -124,8 +131,9 @@ public final class TokenizerTrainer exte
         CasConsumerUtil.getOptionalBooleanParameter(
         mContext, IS_ALPHA_NUMERIC_OPTIMIZATION);
     
-    if (isSkipAlphaNumerics == null)
+    if (isSkipAlphaNumerics == null) {
     	isSkipAlphaNumerics = false;
+    }
     
     additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter(
         getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE);
@@ -135,6 +143,16 @@ public final class TokenizerTrainer exte
       additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter(
           getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING);
     }
+    
+    String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter(
+            getUimaContext(), "opennlp.uima.SampleTraceFile");
+        
+    if (sampleTraceFileName != null) {
+      sampleTraceFile = new File(getUimaContextAdmin().getResourceManager()
+          .getDataPath() + File.separatorChar + sampleTraceFileName);
+      sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter(
+          getUimaContext(), "opennlp.uima.SampleTraceFileEncoding");
+    }
   }
 
   /**
@@ -208,7 +226,12 @@ public final class TokenizerTrainer exte
    
     ObjectStream<TokenSample> samples = ObjectStreamUtils.createObjectStream(tokenSamples);
     
+    // Write stream to disk ...
+    // if trace file
+    // serialize events ...
+    
     InputStream additionalTrainingDataIn = null;
+    Writer samplesOut = null;
     TokenizerModel tokenModel;
     
     try {
@@ -226,6 +249,11 @@ public final class TokenizerTrainer exte
         samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
       }
       
+      if (sampleTraceFile != null) {
+        samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding);
+        samples = new SampleTraceStream<TokenSample>(samples, samplesOut);
+      }
+      
       tokenModel = TokenizerME.train(language, samples, isSkipAlphaNumerics);
     }
     finally {
@@ -256,4 +284,4 @@ public final class TokenizerTrainer exte
     // dereference to allow garbage collection
     tokenSamples = null;
   }
-}
\ No newline at end of file
+}