You are viewing a plain text version of this content. The canonical link to it is not included in this plain text version.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/09/11 14:27:48 UTC
svn commit: r1383378 - in /opennlp/trunk/opennlp-uima:
descriptors/TokenizerTrainer.xml
src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java
Author: joern
Date: Tue Sep 11 12:27:48 2012
New Revision: 1383378
URL: http://svn.apache.org/viewvc?rev=1383378&view=rev
Log:
OPENNLP-535 Added sample trace file support to the tokenizer trainer.
Modified:
opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml
opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java
Modified: opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml?rev=1383378&r1=1383377&r2=1383378&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml (original)
+++ opennlp/trunk/opennlp-uima/descriptors/TokenizerTrainer.xml Tue Sep 11 12:27:48 2012
@@ -61,6 +61,18 @@
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
+ <configurationParameter>
+ <name>opennlp.uima.SampleTraceFile</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>opennlp.uima.SampleTraceFileEncoding</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<nameValuePair>
Modified: opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java?rev=1383378&r1=1383377&r2=1383378&view=diff
==============================================================================
--- opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java (original)
+++ opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java Tue Sep 11 12:27:48 2012
@@ -19,9 +19,12 @@ package opennlp.uima.tokenize;
import java.io.File;
import java.io.FileInputStream;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
@@ -30,7 +33,6 @@ import java.util.List;
import opennlp.maxent.GIS;
import opennlp.tools.namefind.NameSample;
-import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerME;
@@ -42,6 +44,7 @@ import opennlp.tools.util.Span;
import opennlp.uima.util.CasConsumerUtil;
import opennlp.uima.util.ContainingConstraint;
import opennlp.uima.util.OpennlpUtil;
+import opennlp.uima.util.SampleTraceStream;
import opennlp.uima.util.UimaUtil;
import org.apache.uima.UimaContext;
@@ -78,7 +81,7 @@ public final class TokenizerTrainer exte
public static final String IS_ALPHA_NUMERIC_OPTIMIZATION =
"opennlp.uima.tokenizer.IsAlphaNumericOptimization";
-
+
private List<TokenSample> tokenSamples = new ArrayList<TokenSample>();
private UimaContext mContext;
@@ -90,14 +93,18 @@ public final class TokenizerTrainer exte
private String mModelName;
private String additionalTrainingDataFile;
-
+
private String additionalTrainingDataEncoding;
-
+
private String language;
-
+
private Boolean isSkipAlphaNumerics;
-
+
private Logger mLogger;
+
+ private String sampleTraceFileEncoding;
+
+ private File sampleTraceFile;
/**
* Initializes the current instance.
@@ -124,8 +131,9 @@ public final class TokenizerTrainer exte
CasConsumerUtil.getOptionalBooleanParameter(
mContext, IS_ALPHA_NUMERIC_OPTIMIZATION);
- if (isSkipAlphaNumerics == null)
+ if (isSkipAlphaNumerics == null) {
isSkipAlphaNumerics = false;
+ }
additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter(
getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE);
@@ -135,6 +143,16 @@ public final class TokenizerTrainer exte
additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter(
getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING);
}
+
+ String sampleTraceFileName = CasConsumerUtil.getOptionalStringParameter(
+ getUimaContext(), "opennlp.uima.SampleTraceFile");
+
+ if (sampleTraceFileName != null) {
+ sampleTraceFile = new File(getUimaContextAdmin().getResourceManager()
+ .getDataPath() + File.separatorChar + sampleTraceFileName);
+ sampleTraceFileEncoding = CasConsumerUtil.getRequiredStringParameter(
+ getUimaContext(), "opennlp.uima.SampleTraceFileEncoding");
+ }
}
/**
@@ -208,7 +226,12 @@ public final class TokenizerTrainer exte
ObjectStream<TokenSample> samples = ObjectStreamUtils.createObjectStream(tokenSamples);
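+ // If a sample trace file is configured, the sample stream is wrapped
+ // further below in a SampleTraceStream backed by a writer on that file,
+ // opened with the configured sample trace file encoding.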
+
InputStream additionalTrainingDataIn = null;
+ Writer samplesOut = null;
TokenizerModel tokenModel;
try {
@@ -226,6 +249,11 @@ public final class TokenizerTrainer exte
samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
}
+ if (sampleTraceFile != null) {
+ samplesOut = new OutputStreamWriter(new FileOutputStream(sampleTraceFile), sampleTraceFileEncoding);
+ samples = new SampleTraceStream<TokenSample>(samples, samplesOut);
+ }
+
tokenModel = TokenizerME.train(language, samples, isSkipAlphaNumerics);
}
finally {
@@ -256,4 +284,4 @@ public final class TokenizerTrainer exte
// dereference to allow garbage collection
tokenSamples = null;
}
-}
\ No newline at end of file
+}