You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/09/30 16:35:26 UTC
svn commit: r1177681 -
/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java
Author: joern
Date: Fri Sep 30 14:35:26 2011
New Revision: 1177681
URL: http://svn.apache.org/viewvc?rev=1177681&view=rev
Log:
OPENNLP-307 Added support for an additional training data file.
Modified:
incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java
Modified: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java?rev=1177681&r1=1177680&r2=1177681&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java (original)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/tokenize/TokenizerTrainer.java Fri Sep 30 14:35:26 2011
@@ -18,7 +18,10 @@
package opennlp.uima.tokenize;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
@@ -26,10 +29,15 @@ import java.util.LinkedList;
import java.util.List;
import opennlp.maxent.GIS;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
+import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.uima.util.CasConsumerUtil;
import opennlp.uima.util.ContainingConstraint;
@@ -81,6 +89,10 @@ public final class TokenizerTrainer exte
private String mModelName;
+ private String additionalTrainingDataFile;
+
+ private String additionalTrainingDataEncoding;
+
private String language;
private Boolean isSkipAlphaNumerics;
@@ -114,6 +126,15 @@ public final class TokenizerTrainer exte
if (isSkipAlphaNumerics == null)
isSkipAlphaNumerics = false;
+
+ additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter(
+ getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE);
+
+ // If the additional training data is specified, the encoding must be provided!
+ if (additionalTrainingDataFile != null) {
+ additionalTrainingDataEncoding = CasConsumerUtil.getRequiredStringParameter(
+ getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_ENCODING);
+ }
}
/**
@@ -185,11 +206,40 @@ public final class TokenizerTrainer exte
public void collectionProcessComplete(ProcessTrace arg0)
throws ResourceProcessException, IOException {
+ if (mLogger.isLoggable(Level.INFO)) {
+ mLogger.log(Level.INFO, "Collected " + tokenSamples.size() +
+ " token samples.");
+ }
+
GIS.PRINT_MESSAGES = false;
- TokenizerModel tokenModel = TokenizerME.train(language,
- ObjectStreamUtils.createObjectStream(tokenSamples), isSkipAlphaNumerics);
-
+ ObjectStream<TokenSample> samples = ObjectStreamUtils.createObjectStream(tokenSamples);
+
+ InputStream additionalTrainingDataIn = null;
+ TokenizerModel tokenModel;
+
+ try {
+ if (additionalTrainingDataFile != null) {
+
+ if (mLogger.isLoggable(Level.INFO)) {
+ mLogger.log(Level.INFO, "Using addional training data file: " + additionalTrainingDataFile);
+ }
+
+ additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile);
+
+ ObjectStream<TokenSample> additionalSamples = new TokenSampleStream(
+ new PlainTextByLineStream(new InputStreamReader(additionalTrainingDataIn, additionalTrainingDataEncoding)));
+
+ samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
+ }
+
+ tokenModel = TokenizerME.train(language, samples, isSkipAlphaNumerics);
+ }
+ finally {
+ if (additionalTrainingDataIn != null)
+ additionalTrainingDataIn.close();
+ }
+
// dereference to allow garbage collection
tokenSamples = null;