You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/11/16 12:58:50 UTC
svn commit: r1202649 - in /incubator/opennlp/trunk:
opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/
opennlp-uima/src/main/java/opennlp/uima/namefind/
opennlp-uima/src/main/java/opennlp/uima/util/
Author: joern
Date: Wed Nov 16 11:58:49 2011
New Revision: 1202649
URL: http://svn.apache.org/viewvc?rev=1202649&view=rev
Log:
OPENNLP-376 Added support for feature generator definition file.
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java
incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/util/OpennlpUtil.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1202649&r1=1202648&r2=1202649&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Wed Nov 16 11:58:49 2011
@@ -40,6 +40,9 @@ import opennlp.tools.util.PlainTextByLin
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.ModelUtil;
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
public final class TokenNameFinderTrainerTool implements CmdLineTool {
interface TrainerToolParams extends TrainingParams, TrainingToolParams{
@@ -100,7 +103,7 @@ public final class TokenNameFinderTraine
return featureGeneratorBytes;
}
- static Map<String, Object> loadResources(File resourcePath) {
+ public static Map<String, Object> loadResources(File resourcePath) {
Map<String, Object> resources = new HashMap<String, Object>();
if (resourcePath != null) {
Modified: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java?rev=1202649&r1=1202648&r2=1202649&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java (original)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/namefind/NameFinderTrainer.java Wed Nov 16 11:58:49 2011
@@ -27,8 +27,10 @@ import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
import opennlp.maxent.GIS;
+import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
@@ -70,6 +72,8 @@ import org.apache.uima.util.ProcessTrace
* Optional parameters
* <table border=1>
* <tr><th>Type</th> <th>Name</th> <th>Description</th></tr>
+ * <tr><td>String</td> <td>opennlp.uima.FeatureGeneratorFile</td> <td>Feature Generator definition file which contain the feature generator configuration</td></tr>
+ * <tr><td>String</td> <td>opennlp.uima.FeatureGeneratorResources</td> <td>Feature Generator resources dictionary</td></tr>
* <tr><td>String</td> <td>opennlp.uima.AdditionalTrainingDataFile</td> <td>Training file which contains additional data in the OpenNLP format</td></tr>
* <tr><td>String</td> <td>opennlp.uima.AdditionalTrainingDataEncoding</td> <td>Encoding of the additional training data</td></tr>
* <tr><td>Integer</td> <td>opennlp.uima.Cutoff</td> <td>(default=5)</td></tr>
@@ -79,10 +83,17 @@ import org.apache.uima.util.ProcessTrace
*/
public final class NameFinderTrainer extends CasConsumer_ImplBase {
+ private static final String FEATURE_GENERATOR_DEFINITION_FILE_PARAMETER = "opennlp.uima.FeatureGeneratorFile";
+ private static final String FEATURE_GENERATOR_RESOURCES_PARAMETER = "opennlp.uima.FeatureGeneratorResources";
+
private Logger logger;
private String modelPath;
+ private byte featureGeneratorDefinition[];
+
+ private File featureGeneratorResourceDir;
+
private String additionalTrainingDataFile;
private String additionalTrainingDataEncoding;
@@ -129,6 +140,24 @@ public final class NameFinderTrainer ext
cutoff = CasConsumerUtil.getOptionalIntegerParameter(getUimaContext(), UimaUtil.CUTOFF_PARAMETER, 5);
iterations = CasConsumerUtil.getOptionalIntegerParameter(getUimaContext(), UimaUtil.ITERATIONS_PARAMETER, 100);
+ String featureGeneratorDefinitionFile = CasConsumerUtil.getOptionalStringParameter(
+ getUimaContext(), FEATURE_GENERATOR_DEFINITION_FILE_PARAMETER);
+
+ if (featureGeneratorDefinitionFile != null) {
+ try {
+ featureGeneratorDefinition = OpennlpUtil.loadBytes(new File(featureGeneratorDefinitionFile));
+ } catch (IOException e) {
+ throw new ResourceInitializationException(e);
+ }
+
+ String featureGeneratorResourcesDirName = CasConsumerUtil.getOptionalStringParameter(
+ getUimaContext(), FEATURE_GENERATOR_RESOURCES_PARAMETER);
+
+ if (featureGeneratorResourcesDirName != null) {
+ featureGeneratorResourceDir = new File(featureGeneratorResourcesDirName);
+ }
+ }
+
additionalTrainingDataFile = CasConsumerUtil.getOptionalStringParameter(
getUimaContext(), UimaUtil.ADDITIONAL_TRAINING_DATA_FILE);
@@ -214,12 +243,8 @@ public final class NameFinderTrainer ext
int startIndex = -1;
int index = 0;
- for (Iterator<AnnotationFS> tokenIterator = tokenList.iterator(); tokenIterator.hasNext();) {
- AnnotationFS token = (AnnotationFS) tokenIterator.next();
-
- for (Iterator<AnnotationFS> it = entityAnnotations.iterator(); it.hasNext();) {
-
- AnnotationFS entity = (AnnotationFS) it.next();
+ for (AnnotationFS token : tokenList) {
+ for (AnnotationFS entity : entityAnnotations) {
if (!isContaining(entity, token)) {
// ... end of an entity
@@ -281,14 +306,13 @@ public final class NameFinderTrainer ext
String tokenArray[] = new String[tokenList.size()];
for (int i = 0; i < tokenArray.length; i++) {
- tokenArray[i] = ((AnnotationFS) tokenList.get(i))
- .getCoveredText();
+ tokenArray[i] = tokenList.get(i).getCoveredText();
}
- NameSample traingSentence = new NameSample(tokenArray, names, null, false);
+ NameSample trainingSentence = new NameSample(tokenArray, names, null, false);
- if (traingSentence.getSentence().length != 0) {
- nameFinderSamples.add(traingSentence);
+ if (trainingSentence.getSentence().length != 0) {
+ nameFinderSamples.add(trainingSentence);
} else {
if (logger.isLoggable(Level.INFO)) {
logger.log(Level.INFO, "Sentence without tokens: " +
@@ -321,7 +345,7 @@ public final class NameFinderTrainer ext
if (additionalTrainingDataFile != null) {
if (logger.isLoggable(Level.INFO)) {
- logger.log(Level.INFO, "Using addional training data file: " + additionalTrainingDataFile);
+ logger.log(Level.INFO, "Using additional training data file: " + additionalTrainingDataFile);
}
additionalTrainingDataIn = new FileInputStream(additionalTrainingDataFile);
@@ -333,11 +357,18 @@ public final class NameFinderTrainer ext
samples = ObjectStreamUtils.createObjectStream(samples, additionalSamples);
}
- // TODO: Make sure its possible to pass custom feature generator
- // User could subclass this trainer to provide a custom feature generator
- nameModel = NameFinderME.train(language, null,
- samples, Collections.EMPTY_MAP, iterations, cutoff);
+ Map<String, Object> resourceMap;
+
+ if (featureGeneratorResourceDir != null) {
+ resourceMap = TokenNameFinderTrainerTool.loadResources(featureGeneratorResourceDir);
+ }
+ else {
+ resourceMap = Collections.emptyMap();
+ }
+
+ nameModel = NameFinderME.train(language, null,
+ samples, featureGeneratorDefinition, resourceMap, iterations, cutoff);
}
finally {
if (additionalTrainingDataIn != null)
Modified: incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/util/OpennlpUtil.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/util/OpennlpUtil.java?rev=1202649&r1=1202648&r2=1202649&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/util/OpennlpUtil.java (original)
+++ incubator/opennlp/trunk/opennlp-uima/src/main/java/opennlp/uima/util/OpennlpUtil.java Wed Nov 16 11:58:49 2011
@@ -18,9 +18,12 @@
package opennlp.uima.util;
import java.io.BufferedOutputStream;
+import java.io.ByteArrayOutputStream;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStream;
import opennlp.maxent.GISModel;
@@ -53,4 +56,26 @@ final public class OpennlpUtil {
modelOut.close();
}
}
+
+ public static final byte[] loadBytes(File inFile) throws IOException {
+ ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+
+ InputStream in = null;
+ try {
+ in = new FileInputStream(inFile);
+
+ byte buffer[] = new byte[1024];
+ int len;
+
+ while ((len = in.read(buffer)) > 0) {
+ bytes.write(buffer, 0, len);
+ }
+ }
+ finally {
+ if (in != null)
+ in.close();
+ }
+
+ return bytes.toByteArray();
+ }
}
\ No newline at end of file