You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/25 11:12:13 UTC

svn commit: r1127447 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind: TokenNameFinderTrainerTool.java TrainingParameters.java

Author: joern
Date: Wed May 25 09:12:13 2011
New Revision: 1127447

URL: http://svn.apache.org/viewvc?rev=1127447&view=rev
Log:
OPENNLP-17 Added support for custom feature generator

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1127447&r1=1127446&r2=1127447&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Wed May 25 09:12:13 2011
@@ -20,8 +20,11 @@ package opennlp.tools.cmdline.namefind;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.nio.charset.Charset;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
 
 import opennlp.model.TrainUtil;
 import opennlp.tools.cmdline.CLI;
@@ -31,8 +34,11 @@ import opennlp.tools.cmdline.TerminateTo
 import opennlp.tools.namefind.NameSample;
 import opennlp.tools.namefind.NameSampleDataStream;
 import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.ModelUtil;
 
 public final class TokenNameFinderTrainerTool implements CmdLineTool {
 
@@ -82,6 +88,93 @@ public final class TokenNameFinderTraine
     File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
     
+    
+    byte featureGeneratorBytes[] = null;
+    
+    // load descriptor file into memory
+    if (parameters.getFeatureGenDescriptorFile() != null) {
+      InputStream bytesIn = 
+          CmdLineUtil.openInFile(new File(parameters.getFeatureGenDescriptorFile()));
+      
+      try {
+        featureGeneratorBytes = ModelUtil.read(bytesIn);
+      } catch (IOException e) {
+        CmdLineUtil.printTrainingIoError(e);
+        throw new TerminateToolException(-1);
+      }
+      finally {
+        try {
+          bytesIn.close();
+        } catch (IOException e) {
+          // sorry that this can fail
+        }
+      }
+    }
+    
+    // TODO: Support Custom resources: 
+    //       Must be loaded into memory, or written to tmp file until descriptor 
+    //       is loaded, which defines how they are parsed when the model is loaded
+    
+    String resourceDirectory = parameters.getResourceDirectory();
+    
+    Map<String, Object> resources = new HashMap<String, Object>();
+    
+    if (resourceDirectory != null) {
+      
+      Map<String, ArtifactSerializer> artifactSerializers = 
+          TokenNameFinderModel.createArtifactSerializers();
+      
+      File resourcePath = new File(resourceDirectory);
+      
+      File resourceFiles[] = resourcePath.listFiles();
+      
+      // TODO: Filter files, including files which start with a dot
+      for (File resourceFile : resourceFiles) {
+        
+        // TODO: Move extension extracting code to method and
+        //       write unit test for it
+        
+        // extract file ending
+        String resourceName = resourceFile.getName();
+        
+        int lastDot = resourceName.lastIndexOf('.');
+        
+        if (lastDot == -1) {
+          continue;
+        }
+        
+        String ending = resourceName.substring(lastDot + 1);
+        
+        // lookup serializer from map
+        ArtifactSerializer serializer = artifactSerializers.get(ending);
+        
+        // TODO: Handle differently? For now just ignore ....
+        if (serializer == null)
+          continue;
+        
+        InputStream resoruceIn = CmdLineUtil.openInFile(resourceFile);
+        
+        try {
+          resources.put(resourceName, serializer.create(resoruceIn));
+        }
+        catch (InvalidFormatException e) {
+          // TODO: Fix exception handling
+          e.printStackTrace();
+        }
+        catch (IOException e) {
+          // TODO: Fix exception handling
+          e.printStackTrace();
+        }
+        finally {
+          try {
+            resoruceIn.close();
+          }
+          catch (IOException e) {
+          }
+        }
+      }
+    }
+    
     CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
     ObjectStream<NameSample> sampleStream = openSampleData("Training", trainingDataInFile,
         parameters.getEncoding());
@@ -90,8 +183,8 @@ public final class TokenNameFinderTraine
     try {
       if (mlParams == null) {
       model = opennlp.tools.namefind.NameFinderME.train(parameters.getLanguage(), parameters.getType(),
-           sampleStream, Collections.<String, Object>emptyMap(),
-           parameters.getNumberOfIterations(), parameters.getCutoff());
+           sampleStream, featureGeneratorBytes, resources, parameters.getNumberOfIterations(),
+           parameters.getCutoff());
       }
       else {
         model = opennlp.tools.namefind.NameFinderME.train(parameters.getLanguage(), parameters.getType(), sampleStream, mlParams, null,

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java?rev=1127447&r1=1127446&r2=1127447&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TrainingParameters.java Wed May 25 09:12:13 2011
@@ -26,9 +26,13 @@ import opennlp.tools.cmdline.CmdLineUtil
 class TrainingParameters extends BasicTrainingParameters {
 
   private static final String TYPE_PARAM = "-type";
+  private static final String FEATURE_GEN_PARAM = "-featuregen";
   
   private String type;
   
+  private String featureGeneratorDescription;
+  private String resourceDirectory;
+  
   TrainingParameters(String args[]) {
     super(args);
    
@@ -36,14 +40,27 @@ class TrainingParameters extends BasicTr
     
     if (type == null)
       type = "default";
+    
+    featureGeneratorDescription = CmdLineUtil.getParameter(FEATURE_GEN_PARAM, args);
+    
+    resourceDirectory = CmdLineUtil.getParameter("-resources", args);
   }
   
   String getType() {
     return type;
   }
   
+  String getFeatureGenDescriptorFile() {
+    return featureGeneratorDescription;
+  }
+  
+  // TODO: Add parameter to description
+  String getResourceDirectory() {
+    return resourceDirectory;
+  }
+  
   public static String getParameterUsage() {
-    return BasicTrainingParameters.getParameterUsage() + " [" + TYPE_PARAM +" type]";
+    return BasicTrainingParameters.getParameterUsage() + " [" + TYPE_PARAM +" type]" + " [" + FEATURE_GEN_PARAM +" type]";
   }
   
   public static String getDescription() {