You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/25 11:12:13 UTC
svn commit: r1127447 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind:
TokenNameFinderTrainerTool.java TrainingParameters.java
Author: joern
Date: Wed May 25 09:12:13 2011
New Revision: 1127447
URL: http://svn.apache.org/viewvc?rev=1127447&view=rev
Log:
OPENNLP-17 Added support for custom feature generator
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1127447&r1=1127446&r2=1127447&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Wed May 25 09:12:13 2011
@@ -20,8 +20,11 @@ package opennlp.tools.cmdline.namefind;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
import opennlp.model.TrainUtil;
import opennlp.tools.cmdline.CLI;
@@ -31,8 +34,11 @@ import opennlp.tools.cmdline.TerminateTo
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.ModelUtil;
public final class TokenNameFinderTrainerTool implements CmdLineTool {
@@ -82,6 +88,93 @@ public final class TokenNameFinderTraine
File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
+
+ byte featureGeneratorBytes[] = null;
+
+ // load descriptor file into memory
+ if (parameters.getFeatureGenDescriptorFile() != null) {
+ InputStream bytesIn =
+ CmdLineUtil.openInFile(new File(parameters.getFeatureGenDescriptorFile()));
+
+ try {
+ featureGeneratorBytes = ModelUtil.read(bytesIn);
+ } catch (IOException e) {
+ CmdLineUtil.printTrainingIoError(e);
+ throw new TerminateToolException(-1);
+ }
+ finally {
+ try {
+ bytesIn.close();
+ } catch (IOException e) {
+ // sorry that this can fail
+ }
+ }
+ }
+
+ // TODO: Support Custom resources:
+ // Must be loaded into memory, or written to tmp file until descriptor
+ // is loaded which defines parses when model is loaded
+
+ String resourceDirectory = parameters.getResourceDirectory();
+
+ Map<String, Object> resources = new HashMap<String, Object>();
+
+ if (resourceDirectory != null) {
+
+ Map<String, ArtifactSerializer> artifactSerializers =
+ TokenNameFinderModel.createArtifactSerializers();
+
+ File resourcePath = new File(resourceDirectory);
+
+ File resourceFiles[] = resourcePath.listFiles();
+
+ // TODO: Filter files, also files with start with a dot
+ for (File resourceFile : resourceFiles) {
+
+ // TODO: Move extension extracting code to method and
+ // write unit test for it
+
+ // extract file ending
+ String resourceName = resourceFile.getName();
+
+ int lastDot = resourceName.lastIndexOf('.');
+
+ if (lastDot == -1) {
+ continue;
+ }
+
+ String ending = resourceName.substring(lastDot + 1);
+
+ // lookup serializer from map
+ ArtifactSerializer serializer = artifactSerializers.get(ending);
+
+ // TODO: Do different? For now just ignore ....
+ if (serializer == null)
+ continue;
+
+ InputStream resoruceIn = CmdLineUtil.openInFile(resourceFile);
+
+ try {
+ resources.put(resourceName, serializer.create(resoruceIn));
+ }
+ catch (InvalidFormatException e) {
+ // TODO: Fix exception handling
+ e.printStackTrace();
+ }
+ catch (IOException e) {
+ // TODO: Fix exception handling
+ e.printStackTrace();
+ }
+ finally {
+ try {
+ resoruceIn.close();
+ }
+ catch (IOException e) {
+ }
+ }
+ }
+ }
+
CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
ObjectStream<NameSample> sampleStream = openSampleData("Training", trainingDataInFile,
parameters.getEncoding());
@@ -90,8 +183,8 @@ public final class TokenNameFinderTraine
try {
if (mlParams == null) {
model = opennlp.tools.namefind.NameFinderME.train(parameters.getLanguage(), parameters.getType(),
- sampleStream, Collections.<String, Object>emptyMap(),
- parameters.getNumberOfIterations(), parameters.getCutoff());
+ sampleStream, featureGeneratorBytes, resources, parameters.getNumberOfIterations(),
+ parameters.getCutoff());
}
else {
model = opennlp.tools.namefind.NameFinderME.train(parameters.getLanguage(), parameters.getType(), sampleStream, mlParams, null,
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java?rev=1127447&r1=1127446&r2=1127447&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java Wed May 25 09:12:13 2011
@@ -26,9 +26,13 @@ import opennlp.tools.cmdline.CmdLineUtil
class TrainingParameters extends BasicTrainingParameters {
private static final String TYPE_PARAM = "-type";
+ private static final String FEATURE_GEN_PARAM = "-featuregen";
private String type;
+ private String featureGeneratorDescription;
+ private String resourceDirectory;
+
TrainingParameters(String args[]) {
super(args);
@@ -36,14 +40,27 @@ class TrainingParameters extends BasicTr
if (type == null)
type = "default";
+
+ featureGeneratorDescription = CmdLineUtil.getParameter(FEATURE_GEN_PARAM, args);
+
+ resourceDirectory = CmdLineUtil.getParameter("-resources", args);
}
String getType() {
return type;
}
+ String getFeatureGenDescriptorFile() {
+ return featureGeneratorDescription;
+ }
+
+ // TODO: Add parameter to description
+ String getResourceDirectory() {
+ return resourceDirectory;
+ }
+
public static String getParameterUsage() {
- return BasicTrainingParameters.getParameterUsage() + " [" + TYPE_PARAM +" type]";
+ return BasicTrainingParameters.getParameterUsage() + " [" + TYPE_PARAM +" type]" + " [" + FEATURE_GEN_PARAM +" type]";
}
public static String getDescription() {