You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/06/03 08:38:13 UTC

svn commit: r1130913 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/ cmdline/namefind/ namefind/

Author: colen
Date: Fri Jun  3 06:38:13 2011
New Revision: 1130913

URL: http://svn.apache.org/viewvc?rev=1130913&view=rev
Log:
OPENNLP-178 Added cross validation cmd line tool for the name finder

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java   (with props)
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java?rev=1130913&r1=1130912&r2=1130913&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java Fri Jun  3 06:38:13 2011
@@ -35,6 +35,7 @@ import opennlp.tools.cmdline.doccat.Docc
 import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
 import opennlp.tools.cmdline.namefind.CensusDictionaryCreatorTool;
 import opennlp.tools.cmdline.namefind.TokenNameFinderConverterTool;
+import opennlp.tools.cmdline.namefind.TokenNameFinderCrossValidatorTool;
 import opennlp.tools.cmdline.namefind.TokenNameFinderEvaluatorTool;
 import opennlp.tools.cmdline.namefind.TokenNameFinderTool;
 import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
@@ -96,6 +97,7 @@ public final class CLI {
     tools.add(new TokenNameFinderTool());
     tools.add(new TokenNameFinderTrainerTool());
     tools.add(new TokenNameFinderEvaluatorTool());
+    tools.add(new TokenNameFinderCrossValidatorTool());
     tools.add(new TokenNameFinderConverterTool());
     tools.add(new CensusDictionaryCreatorTool());
     

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java?rev=1130913&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java Fri Jun  3 06:38:13 2011
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.namefind;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Map;
+
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CmdLineTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.TokenNameFinderCrossValidator;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Command line tool which cross validates the learnable name finder on a
+ * training data file and prints the resulting F-measure.
+ */
+public final class TokenNameFinderCrossValidatorTool implements CmdLineTool {
+
+  /** Number of partitions the training data is split into. */
+  private static final int FOLDS = 10;
+
+  public String getName() {
+    return "TokenNameFinderCrossValidator";
+  }
+
+  public String getShortDescription() {
+    return "10-fold cross validator for the learnable Name Finder";
+  }
+
+  public String getHelp() {
+    StringBuilder usage = new StringBuilder("Usage: ");
+    usage.append(CLI.CMD).append(" ").append(getName()).append(" ");
+    usage.append(TrainingParameters.getParameterUsage());
+    usage.append(" -data trainData\n");
+    usage.append(TrainingParameters.getDescription());
+    return usage.toString();
+  }
+
+  /**
+   * Prints the usage text and hands back the exception which terminates the
+   * tool with exit code 1.
+   */
+  private TerminateToolException usageError() {
+    System.out.println(getHelp());
+    return new TerminateToolException(1);
+  }
+
+  /**
+   * Parses the arguments, loads the optional training parameters, feature
+   * generator descriptor and resources, runs the cross validation on the
+   * file given with -data and prints the F-measure.
+   *
+   * @param args
+   *          the command line arguments of the tool
+   */
+  public void run(String[] args) {
+    if (args.length < 6) {
+      throw usageError();
+    }
+
+    TrainingParameters cmdParams = new TrainingParameters(args);
+    if (!cmdParams.isValid()) {
+      throw usageError();
+    }
+
+    String paramsFile = CmdLineUtil.getParameter("-params", args);
+    opennlp.tools.util.TrainingParameters mlParams =
+        CmdLineUtil.loadTrainingParameters(paramsFile, false);
+
+    byte[] generatorDescriptor = TokenNameFinderTrainerTool
+        .openFeatureGeneratorBytes(cmdParams.getFeatureGenDescriptorFile());
+    Map<String, Object> resources = TokenNameFinderTrainerTool
+        .loadResources(cmdParams.getResourceDirectory());
+
+    File dataFile = new File(CmdLineUtil.getParameter("-data", args));
+    CmdLineUtil.checkInputFile("Training Data", dataFile);
+
+    ObjectStream<NameSample> samples = TokenNameFinderTrainerTool
+        .openSampleData("Training Data", dataFile, cmdParams.getEncoding());
+
+    TokenNameFinderCrossValidator validator;
+    try {
+      // explicit training parameters win over iterations and cutoff
+      validator = (mlParams != null)
+          ? new TokenNameFinderCrossValidator(cmdParams.getLanguage(),
+              cmdParams.getType(), mlParams, generatorDescriptor, resources)
+          : new TokenNameFinderCrossValidator(cmdParams.getLanguage(),
+              cmdParams.getType(), generatorDescriptor, resources,
+              cmdParams.getNumberOfIterations(), cmdParams.getCutoff());
+      validator.evaluate(samples, FOLDS);
+    } catch (IOException e) {
+      CmdLineUtil.printTrainingIoError(e);
+      throw new TerminateToolException(-1);
+    } finally {
+      try {
+        samples.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    System.out.println("done");
+    System.out.println();
+    System.out.println(validator.getFMeasure());
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1130913&r1=1130912&r2=1130913&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Fri Jun  3 06:38:13 2011
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -66,41 +67,19 @@ public final class TokenNameFinderTraine
     return new NameSampleDataStream(lineStream);
   }
   
-  public void run(String[] args) {
-    
-    if (args.length < 8) {
-      System.out.println(getHelp());
-      throw new TerminateToolException(1);
-    }
-    
-    TrainingParameters parameters = new TrainingParameters(args);
-    
-    if(!parameters.isValid()) {
-      System.out.println(getHelp());
-      throw new TerminateToolException(1);
-    }
-    
-    opennlp.tools.util.TrainingParameters mlParams = 
-      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), true);
-    
-    File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
-    File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
-    
-    
+  static byte[] openFeatureGeneratorBytes(String featureGenDescriptorFile) {
     byte featureGeneratorBytes[] = null;
-    
     // load descriptor file into memory
-    if (parameters.getFeatureGenDescriptorFile() != null) {
-      InputStream bytesIn = 
-          CmdLineUtil.openInFile(new File(parameters.getFeatureGenDescriptorFile()));
-      
+    if (featureGenDescriptorFile != null) {
+      InputStream bytesIn = CmdLineUtil.openInFile(new File(
+          featureGenDescriptorFile));
+
       try {
         featureGeneratorBytes = ModelUtil.read(bytesIn);
       } catch (IOException e) {
         CmdLineUtil.printTrainingIoError(e);
         throw new TerminateToolException(-1);
-      }
-      finally {
+      } finally {
         try {
           bytesIn.close();
         } catch (IOException e) {
@@ -108,71 +87,97 @@ public final class TokenNameFinderTraine
         }
       }
     }
-    
-    // TODO: Support Custom resources: 
-    //       Must be loaded into memory, or written to tmp file until descriptor 
-    //       is loaded which defines parses when model is loaded
-    
-    String resourceDirectory = parameters.getResourceDirectory();
-    
+    return featureGeneratorBytes;
+  }
+  
+  static Map<String, Object> loadResources(String resourceDirectory) {
     Map<String, Object> resources = new HashMap<String, Object>();
-    
+
     if (resourceDirectory != null) {
-      
-      Map<String, ArtifactSerializer> artifactSerializers = 
-          TokenNameFinderModel.createArtifactSerializers();
-      
+
+      Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
+          .createArtifactSerializers();
+
       File resourcePath = new File(resourceDirectory);
-      
+
       File resourceFiles[] = resourcePath.listFiles();
-      
+
       // TODO: Filter files, also files which start with a dot
       for (File resourceFile : resourceFiles) {
-        
+
         // TODO: Move extension extracting code to method and
-        //       write unit test for it
-        
+        // write unit test for it
+
         // extract file ending
         String resourceName = resourceFile.getName();
-        
+
         int lastDot = resourceName.lastIndexOf('.');
-        
+
         if (lastDot == -1) {
           continue;
         }
-        
+
         String ending = resourceName.substring(lastDot + 1);
-        
+
         // lookup serializer from map
         ArtifactSerializer serializer = artifactSerializers.get(ending);
-        
+
         // TODO: Do different? For now just ignore ....
         if (serializer == null)
           continue;
-        
+
         InputStream resoruceIn = CmdLineUtil.openInFile(resourceFile);
-        
+
         try {
           resources.put(resourceName, serializer.create(resoruceIn));
-        }
-        catch (InvalidFormatException e) {
+        } catch (InvalidFormatException e) {
           // TODO: Fix exception handling
           e.printStackTrace();
-        }
-        catch (IOException e) {
+        } catch (IOException e) {
           // TODO: Fix exception handling
           e.printStackTrace();
-        }
-        finally {
+        } finally {
           try {
             resoruceIn.close();
-          }
-          catch (IOException e) {
+          } catch (IOException e) {
           }
         }
       }
     }
+
+    return resources;
+  }
+  
+  public void run(String[] args) {
     
+    if (args.length < 8) {
+      System.out.println(getHelp());
+      throw new TerminateToolException(1);
+    }
+    
+    TrainingParameters parameters = new TrainingParameters(args);
+    
+    if(!parameters.isValid()) {
+      System.out.println(getHelp());
+      throw new TerminateToolException(1);
+    }
+    
+    opennlp.tools.util.TrainingParameters mlParams = 
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), true);
+    
+    File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
+    File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
+    
+    
+    byte featureGeneratorBytes[] = openFeatureGeneratorBytes(parameters.getFeatureGenDescriptorFile());
+    
+    
+    // TODO: Support Custom resources: 
+    //       Must be loaded into memory, or written to tmp file until descriptor 
+    //       is loaded which defines parses when model is loaded
+    
+    Map<String, Object> resources = loadResources(parameters.getResourceDirectory());
+        
     CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
     ObjectStream<NameSample> sampleStream = openSampleData("Training", trainingDataInFile,
         parameters.getEncoding());

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java?rev=1130913&r1=1130912&r2=1130913&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java Fri Jun  3 06:38:13 2011
@@ -15,14 +15,14 @@
  * limitations under the License.
  */
 
-
 package opennlp.tools.namefind;
 
 import java.io.IOException;
 import java.util.Collections;
+import java.util.Map;
 
-import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.eval.CrossValidationPartitioner;
 import opennlp.tools.util.eval.FMeasure;
 
@@ -31,37 +31,144 @@ public class TokenNameFinderCrossValidat
   private final String languageCode;
   private final int cutoff;
   private final int iterations;
-  private FMeasure fmeasure = new FMeasure();
+  private final TrainingParameters params;
+  private final String type;
+  private final byte[] featureGeneratorBytes;
+  private final Map<String, Object> resources;
   
-  public TokenNameFinderCrossValidator(String languageCode, int cutoff, int iterations) {
+
+  private FMeasure fmeasure = new FMeasure();
+
+  /**
+   * Name finder cross validator which passes a {@code null} type to the
+   * four-argument constructor.
+   *
+   * @param languageCode
+   *          the language of the training data
+   * @param cutoff
+   *          specifies the min number of times a feature must be seen
+   * @param iterations
+   *          the number of iterations
+   */
+  public TokenNameFinderCrossValidator(String languageCode, int cutoff,
+      int iterations) {
+    this(languageCode, null, cutoff, iterations);
+  }
+
+  /**
+   * Name finder cross validator without custom training parameters and
+   * without a feature generator descriptor; the resource map is set to an
+   * empty map.
+   * 
+   * @param languageCode
+   *          the language of the training data
+   * @param type
+   *          null or an override type for all types in the training data
+   * @param cutoff
+   *          specifies the min number of times a feature must be seen
+   * @param iterations
+   *          the number of iterations
+   */
+  public TokenNameFinderCrossValidator(String languageCode, String type,
+      int cutoff, int iterations) {
     this.languageCode = languageCode;
     this.cutoff = cutoff;
     this.iterations = iterations;
+    this.type = type;
+    
+    this.params = null;
+    this.featureGeneratorBytes = null;
+    this.resources = Collections.<String, Object>emptyMap(); 
   }
-  
-  public void evaluate(ObjectStream<NameSample> samples, int nFolds) throws IOException,
-      InvalidFormatException, IOException {
-    CrossValidationPartitioner<NameSample> partitioner = 
-        new CrossValidationPartitioner<NameSample>(samples, nFolds);
+
+  /**
+   * Name finder cross validator with a feature generator descriptor and
+   * resources, trained with explicit iteration and cutoff values (the
+   * training parameters are left {@code null}).
+   * <p>
+   * Note the argument order: {@code iterations} comes before {@code cutoff}
+   * here, the reverse of the constructors which take only cutoff and
+   * iterations.
+   * 
+   * @param languageCode
+   *          the language of the training data
+   * @param type
+   *          null or an override type for all types in the training data
+   * @param featureGeneratorBytes
+   *          descriptor to configure the feature generation or null
+   * @param resources
+   *          the resources for the name finder or null if none
+   * @param iterations
+   *          the number of iterations
+   * @param cutoff
+   *          specifies the min number of times a feature must be seen
+   */
+  public TokenNameFinderCrossValidator(String languageCode, String type,
+      byte[] featureGeneratorBytes,
+      Map<String, Object> resources, int iterations, int cutoff) {
+    this.languageCode = languageCode;
+    this.cutoff = cutoff;
+    this.iterations = iterations;
+    this.type = type;
+    this.featureGeneratorBytes = featureGeneratorBytes;
+    this.resources = resources;
     
+    this.params = null;
+  }
+
+  /**
+   * Name finder cross validator which trains with the given machine learning
+   * parameters instead of explicit cutoff and iteration values.
+   *
+   * @param languageCode
+   *          the language of the training data
+   * @param type
+   *          null or an override type for all types in the training data
+   * @param trainParams
+   *          machine learning train parameters; when null, evaluate takes
+   *          the iterations/cutoff branch with the -1 values set here
+   * @param featureGeneratorBytes
+   *          descriptor to configure the feature generation or null
+   * @param resources
+   *          the resources for the name finder or null if none
+   */
+  public TokenNameFinderCrossValidator(String languageCode, String type,
+      TrainingParameters trainParams, byte[] featureGeneratorBytes,
+      Map<String, Object> resources) {
+    // evaluate reads cutoff and iterations only when params is null
+    this.params = trainParams;
+    this.iterations = -1;
+    this.cutoff = -1;
+
+    this.languageCode = languageCode;
+    this.type = type;
+    this.resources = resources;
+    this.featureGeneratorBytes = featureGeneratorBytes;
+  }
+
+  /**
+   * Starts the evaluation. For every partition a model is trained on the
+   * training samples (with the training parameters when they are set,
+   * otherwise with iterations and cutoff), tested on the partition's test
+   * samples, and the result is merged into the F-measure of this validator.
+   * 
+   * @param samples the data to train and test
+   * @param nFolds number of folds
+   * 
+   * @throws IOException propagated from the partitioning, training or
+   *           evaluation calls
+   */
+  public void evaluate(ObjectStream<NameSample> samples, int nFolds)
+      throws IOException {
+    CrossValidationPartitioner<NameSample> partitioner = new CrossValidationPartitioner<NameSample>(
+        samples, nFolds);
+
     while (partitioner.hasNext()) {
-      
-      CrossValidationPartitioner.TrainingSampleStream<NameSample> trainingSampleStream =
-          partitioner.next();
-      
-      TokenNameFinderModel model = NameFinderME.train(languageCode, null, trainingSampleStream,
-          Collections.<String, Object>emptyMap(), cutoff, iterations);
-       
-       // do testing
-       TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(
-           new NameFinderME(model));
-
-       evaluator.evaluate(trainingSampleStream.getTestSampleStream());
-       
-       fmeasure.mergeInto(evaluator.getFMeasure());
-     }
+
+      CrossValidationPartitioner.TrainingSampleStream<NameSample> trainingSampleStream = partitioner
+          .next();
+
+      TokenNameFinderModel model;
+      if (params == null) {
+        model = NameFinderME.train(languageCode, type, trainingSampleStream,
+            featureGeneratorBytes, resources, iterations, cutoff);
+      } else {
+        model = opennlp.tools.namefind.NameFinderME.train(languageCode, type,
+            trainingSampleStream, params, featureGeneratorBytes, resources);
+      }
+
+      // do testing
+      TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(
+          new NameFinderME(model));
+
+      evaluator.evaluate(trainingSampleStream.getTestSampleStream());
+
+      fmeasure.mergeInto(evaluator.getFMeasure());
+    }
   }
-  
+
   /**
    * Returns the F-measure of this validator, into which evaluate merges the
    * result of every partition.
    */
   public FMeasure getFMeasure() {
     return fmeasure;
   }