You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/06/03 08:38:13 UTC
svn commit: r1130913 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools:
cmdline/ cmdline/namefind/ namefind/
Author: colen
Date: Fri Jun 3 06:38:13 2011
New Revision: 1130913
URL: http://svn.apache.org/viewvc?rev=1130913&view=rev
Log:
OPENNLP-178 Added cross validation cmd line tool for the name finder
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java (with props)
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java?rev=1130913&r1=1130912&r2=1130913&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java Fri Jun 3 06:38:13 2011
@@ -35,6 +35,7 @@ import opennlp.tools.cmdline.doccat.Docc
import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
import opennlp.tools.cmdline.namefind.CensusDictionaryCreatorTool;
import opennlp.tools.cmdline.namefind.TokenNameFinderConverterTool;
+import opennlp.tools.cmdline.namefind.TokenNameFinderCrossValidatorTool;
import opennlp.tools.cmdline.namefind.TokenNameFinderEvaluatorTool;
import opennlp.tools.cmdline.namefind.TokenNameFinderTool;
import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
@@ -96,6 +97,7 @@ public final class CLI {
tools.add(new TokenNameFinderTool());
tools.add(new TokenNameFinderTrainerTool());
tools.add(new TokenNameFinderEvaluatorTool());
+ tools.add(new TokenNameFinderCrossValidatorTool());
tools.add(new TokenNameFinderConverterTool());
tools.add(new CensusDictionaryCreatorTool());
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java?rev=1130913&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java Fri Jun 3 06:38:13 2011
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.namefind;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Map;
+
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.TokenNameFinderCrossValidator;
+import opennlp.tools.util.ObjectStream;
+
+public final class TokenNameFinderCrossValidatorTool implements CmdLineTool {
+
+ public String getName() {
+ return "TokenNameFinderCrossValidator";
+ }
+
+ public String getShortDescription() {
+ return "10-fold cross validator for the learnable Name Finder";
+ }
+
+ public String getHelp() {
+ return "Usage: " + CLI.CMD + " " + getName() + " "
+ + TrainingParameters.getParameterUsage() + " -data trainData\n"
+ + TrainingParameters.getDescription();
+ }
+
+ public void run(String[] args) {
+ if (args.length < 6) {
+ System.out.println(getHelp());
+ throw new TerminateToolException(1);
+ }
+
+ TrainingParameters parameters = new TrainingParameters(args);
+
+ if (!parameters.isValid()) {
+ System.out.println(getHelp());
+ throw new TerminateToolException(1);
+ }
+
+ opennlp.tools.util.TrainingParameters mlParams = CmdLineUtil
+ .loadTrainingParameters(CmdLineUtil.getParameter("-params", args),
+ false);
+
+ byte featureGeneratorBytes[] = TokenNameFinderTrainerTool
+ .openFeatureGeneratorBytes(parameters.getFeatureGenDescriptorFile());
+
+ Map<String, Object> resources = TokenNameFinderTrainerTool
+ .loadResources(parameters.getResourceDirectory());
+
+ File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
+ CmdLineUtil.checkInputFile("Training Data", trainingDataInFile);
+
+ ObjectStream<NameSample> sampleStream = TokenNameFinderTrainerTool
+ .openSampleData("Training Data", trainingDataInFile,
+ parameters.getEncoding());
+
+ TokenNameFinderCrossValidator validator;
+
+ try {
+ if (mlParams == null) {
+ validator = new TokenNameFinderCrossValidator(parameters.getLanguage(), parameters.getType(),
+ featureGeneratorBytes, resources, parameters.getNumberOfIterations(),
+ parameters.getCutoff());
+ } else {
+ validator = new TokenNameFinderCrossValidator(parameters.getLanguage(), parameters.getType(), mlParams,
+ featureGeneratorBytes, resources);
+ }
+ validator.evaluate(sampleStream, 10);
+ } catch (IOException e) {
+ CmdLineUtil.printTrainingIoError(e);
+ throw new TerminateToolException(-1);
+ } finally {
+ try {
+ sampleStream.close();
+ } catch (IOException e) {
+ // ignored: a failure while closing the sample stream cannot be handled here
+ }
+ }
+
+ System.out.println("done");
+
+ System.out.println();
+
+ System.out.println(validator.getFMeasure());
+ }
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1130913&r1=1130912&r2=1130913&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Fri Jun 3 06:38:13 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@@ -66,41 +67,19 @@ public final class TokenNameFinderTraine
return new NameSampleDataStream(lineStream);
}
- public void run(String[] args) {
-
- if (args.length < 8) {
- System.out.println(getHelp());
- throw new TerminateToolException(1);
- }
-
- TrainingParameters parameters = new TrainingParameters(args);
-
- if(!parameters.isValid()) {
- System.out.println(getHelp());
- throw new TerminateToolException(1);
- }
-
- opennlp.tools.util.TrainingParameters mlParams =
- CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), true);
-
- File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
- File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
-
-
+ static byte[] openFeatureGeneratorBytes(String featureGenDescriptorFile) {
byte featureGeneratorBytes[] = null;
-
// load descriptor file into memory
- if (parameters.getFeatureGenDescriptorFile() != null) {
- InputStream bytesIn =
- CmdLineUtil.openInFile(new File(parameters.getFeatureGenDescriptorFile()));
-
+ if (featureGenDescriptorFile != null) {
+ InputStream bytesIn = CmdLineUtil.openInFile(new File(
+ featureGenDescriptorFile));
+
try {
featureGeneratorBytes = ModelUtil.read(bytesIn);
} catch (IOException e) {
CmdLineUtil.printTrainingIoError(e);
throw new TerminateToolException(-1);
- }
- finally {
+ } finally {
try {
bytesIn.close();
} catch (IOException e) {
@@ -108,71 +87,97 @@ public final class TokenNameFinderTraine
}
}
}
-
- // TODO: Support Custom resources:
- // Must be loaded into memory, or written to tmp file until descriptor
- // is loaded which defines parses when model is loaded
-
- String resourceDirectory = parameters.getResourceDirectory();
-
+ return featureGeneratorBytes;
+ }
+
+ static Map<String, Object> loadResources(String resourceDirectory) {
Map<String, Object> resources = new HashMap<String, Object>();
-
+
if (resourceDirectory != null) {
-
- Map<String, ArtifactSerializer> artifactSerializers =
- TokenNameFinderModel.createArtifactSerializers();
-
+
+ Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
+ .createArtifactSerializers();
+
File resourcePath = new File(resourceDirectory);
-
+
File resourceFiles[] = resourcePath.listFiles();
-
+
// TODO: Filter files, also files which start with a dot
for (File resourceFile : resourceFiles) {
-
+
// TODO: Move extension extracting code to method and
- // write unit test for it
-
+ // write unit test for it
+
// extract file ending
String resourceName = resourceFile.getName();
-
+
int lastDot = resourceName.lastIndexOf('.');
-
+
if (lastDot == -1) {
continue;
}
-
+
String ending = resourceName.substring(lastDot + 1);
-
+
// lookup serializer from map
ArtifactSerializer serializer = artifactSerializers.get(ending);
-
+
// TODO: Do different? For now just ignore ....
if (serializer == null)
continue;
-
+
InputStream resoruceIn = CmdLineUtil.openInFile(resourceFile);
-
+
try {
resources.put(resourceName, serializer.create(resoruceIn));
- }
- catch (InvalidFormatException e) {
+ } catch (InvalidFormatException e) {
// TODO: Fix exception handling
e.printStackTrace();
- }
- catch (IOException e) {
+ } catch (IOException e) {
// TODO: Fix exception handling
e.printStackTrace();
- }
- finally {
+ } finally {
try {
resoruceIn.close();
- }
- catch (IOException e) {
+ } catch (IOException e) {
}
}
}
}
+
+ return resources;
+ }
+
+ public void run(String[] args) {
+ if (args.length < 8) {
+ System.out.println(getHelp());
+ throw new TerminateToolException(1);
+ }
+
+ TrainingParameters parameters = new TrainingParameters(args);
+
+ if(!parameters.isValid()) {
+ System.out.println(getHelp());
+ throw new TerminateToolException(1);
+ }
+
+ opennlp.tools.util.TrainingParameters mlParams =
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), true);
+
+ File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
+ File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
+
+
+ byte featureGeneratorBytes[] = openFeatureGeneratorBytes(parameters.getFeatureGenDescriptorFile());
+
+
+ // TODO: Support Custom resources:
+ // Must be loaded into memory, or written to tmp file until descriptor
+ // is loaded which defines parses when model is loaded
+
+ Map<String, Object> resources = loadResources(parameters.getResourceDirectory());
+
CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
ObjectStream<NameSample> sampleStream = openSampleData("Training", trainingDataInFile,
parameters.getEncoding());
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java?rev=1130913&r1=1130912&r2=1130913&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java Fri Jun 3 06:38:13 2011
@@ -15,14 +15,14 @@
* limitations under the License.
*/
-
package opennlp.tools.namefind;
import java.io.IOException;
import java.util.Collections;
+import java.util.Map;
-import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
import opennlp.tools.util.eval.FMeasure;
@@ -31,37 +31,144 @@ public class TokenNameFinderCrossValidat
private final String languageCode;
private final int cutoff;
private final int iterations;
- private FMeasure fmeasure = new FMeasure();
+ private final TrainingParameters params;
+ private final String type;
+ private final byte[] featureGeneratorBytes;
+ private final Map<String, Object> resources;
- public TokenNameFinderCrossValidator(String languageCode, int cutoff, int iterations) {
+
+ private FMeasure fmeasure = new FMeasure();
+
+ /**
+ * Name finder cross validator
+ *
+ * @param languageCode
+ * the language of the training data
+ * @param cutoff specifies the min number of times a feature must be seen
+ * @param iterations the number of iterations
+ */
+ public TokenNameFinderCrossValidator(String languageCode, int cutoff,
+ int iterations) {
+ this(languageCode, null, cutoff, iterations);
+ }
+
+ /**
+ * Name finder cross validator
+ *
+ * @param languageCode
+ * the language of the training data
+ * @param type
+ * null or an override type for all types in the training data
+ * @param cutoff
+ * specifies the min number of times a feature must be seen
+ * @param iterations
+ * the number of iterations
+ */
+ public TokenNameFinderCrossValidator(String languageCode, String type,
+ int cutoff, int iterations) {
this.languageCode = languageCode;
this.cutoff = cutoff;
this.iterations = iterations;
+ this.type = type;
+
+ this.params = null;
+ this.featureGeneratorBytes = null;
+ this.resources = Collections.<String, Object>emptyMap();
}
-
- public void evaluate(ObjectStream<NameSample> samples, int nFolds) throws IOException,
- InvalidFormatException, IOException {
- CrossValidationPartitioner<NameSample> partitioner =
- new CrossValidationPartitioner<NameSample>(samples, nFolds);
+
+ /**
+ * Name finder cross validator
+ *
+ * @param languageCode
+ * the language of the training data
+ * @param type
+ * null or an override type for all types in the training data
+ * @param featureGeneratorBytes
+ * descriptor to configure the feature generation or null
+ * @param resources
+ * the resources for the name finder or null if none
+ * @param cutoff
+ * specifies the min number of times a feature must be seen
+ * @param iterations
+ * the number of iterations
+ */
+ public TokenNameFinderCrossValidator(String languageCode, String type,
+ byte[] featureGeneratorBytes,
+ Map<String, Object> resources, int iterations, int cutoff) {
+ this.languageCode = languageCode;
+ this.cutoff = cutoff;
+ this.iterations = iterations;
+ this.type = type;
+ this.featureGeneratorBytes = featureGeneratorBytes;
+ this.resources = resources;
+ this.params = null;
+ }
+
+ /**
+ * Name finder cross validator
+ *
+ * @param languageCode
+ * the language of the training data
+ * @param type
+ * null or an override type for all types in the training data
+ * @param trainParams
+ * machine learning train parameters
+ * @param featureGeneratorBytes
+ * descriptor to configure the feature generation or null
+ * @param resources
+ * the resources for the name finder or null if none
+ */
+ public TokenNameFinderCrossValidator(String languageCode, String type,
+ TrainingParameters trainParams, byte[] featureGeneratorBytes, Map<String, Object> resources) {
+
+ this.languageCode = languageCode;
+ this.cutoff = -1;
+ this.iterations = -1;
+ this.type = type;
+ this.featureGeneratorBytes = featureGeneratorBytes;
+ this.resources = resources;
+
+ this.params = trainParams;
+ }
+
+ /**
+ * Starts the evaluation.
+ *
+ * @param samples the data to train and test
+ * @param nFolds number of folds
+ *
+ * @throws IOException
+ */
+ public void evaluate(ObjectStream<NameSample> samples, int nFolds)
+ throws IOException {
+ CrossValidationPartitioner<NameSample> partitioner = new CrossValidationPartitioner<NameSample>(
+ samples, nFolds);
+
while (partitioner.hasNext()) {
-
- CrossValidationPartitioner.TrainingSampleStream<NameSample> trainingSampleStream =
- partitioner.next();
-
- TokenNameFinderModel model = NameFinderME.train(languageCode, null, trainingSampleStream,
- Collections.<String, Object>emptyMap(), cutoff, iterations);
-
- // do testing
- TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(
- new NameFinderME(model));
-
- evaluator.evaluate(trainingSampleStream.getTestSampleStream());
-
- fmeasure.mergeInto(evaluator.getFMeasure());
- }
+
+ CrossValidationPartitioner.TrainingSampleStream<NameSample> trainingSampleStream = partitioner
+ .next();
+
+ TokenNameFinderModel model;
+ if (params == null) {
+ model = NameFinderME.train(languageCode, type, trainingSampleStream,
+ featureGeneratorBytes, resources, iterations, cutoff);
+ } else {
+ model = opennlp.tools.namefind.NameFinderME.train(languageCode, type,
+ trainingSampleStream, params, featureGeneratorBytes, resources);
+ }
+
+ // do testing
+ TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(
+ new NameFinderME(model));
+
+ evaluator.evaluate(trainingSampleStream.getTestSampleStream());
+
+ fmeasure.mergeInto(evaluator.getFMeasure());
+ }
}
-
+
public FMeasure getFMeasure() {
return fmeasure;
}