You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/04/20 16:07:07 UTC

opennlp git commit: OPENNLP-1034: Move serializers to resource mapping to GeneratorFactory

Repository: opennlp
Updated Branches:
  refs/heads/master 041507d3a -> f74a86f4b


OPENNLP-1034: Move serializers to resource mapping to GeneratorFactory

Closes #173


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f74a86f4
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f74a86f4
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f74a86f4

Branch: refs/heads/master
Commit: f74a86f4b6a6f93d3a1e10f2a4852c5898feefb3
Parents: 041507d
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed Apr 19 18:34:15 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Apr 20 18:05:40 2017 +0200

----------------------------------------------------------------------
 .../TokenNameFinderCrossValidatorTool.java      | 10 ++-
 .../namefind/TokenNameFinderTrainerTool.java    | 77 ++++----------------
 .../postag/POSTaggerCrossValidatorTool.java     |  9 ++-
 .../cmdline/postag/POSTaggerTrainerTool.java    | 11 ++-
 .../tools/util/featuregen/GeneratorFactory.java | 41 ++++++++++-
 .../util/featuregen/GeneratorFactoryTest.java   |  2 +-
 6 files changed, 78 insertions(+), 72 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
index 0ee3738..6e62577 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
@@ -71,8 +71,14 @@ public final class TokenNameFinderCrossValidatorTool
     byte[] featureGeneratorBytes =
         TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());
 
-    Map<String, Object> resources =
-        TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+    Map<String, Object> resources;
+
+    try {
+      resources = TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+    }
+    catch (IOException e) {
+      throw new TerminateToolException(-1,"IO error while loading resources", e);
+    }
 
     if (params.getNameTypes() != null) {
       String[] nameTypes = params.getNameTypes().split(",");

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
index 4fb8cb9..f3cef48 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
@@ -20,13 +20,9 @@ package opennlp.tools.cmdline.namefind;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 
-import org.w3c.dom.Element;
-
 import opennlp.tools.cmdline.AbstractTrainerTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
@@ -89,79 +85,31 @@ public final class TokenNameFinderTrainerTool
    * @param featureGenDescriptor the feature xml descriptor
    * @return a map consisting of the file name of the resource and its corresponding Object
    */
-  public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor) {
+  public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor)
+      throws IOException {
     Map<String, Object> resources = new HashMap<>();
 
     if (resourcePath != null) {
+      Map<String, ArtifactSerializer> artifactSerializers = new HashMap<>();
 
-      Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
-          .createArtifactSerializers();
-      List<Element> elements = new ArrayList<>();
-      ArtifactSerializer serializer = null;
-
-
-      // TODO: If there is descriptor file, it should be consulted too
       if (featureGenDescriptor != null) {
 
         try (InputStream xmlDescriptorIn = CmdLineUtil.openInFile(featureGenDescriptor)) {
           artifactSerializers.putAll(
-              GeneratorFactory.extractCustomArtifactSerializerMappings(xmlDescriptorIn));
-        } catch (IOException e) {
-          // TODO: Improve error handling!
-          e.printStackTrace();
-        }
-
-        try (InputStream inputStreamXML = CmdLineUtil.openInFile(featureGenDescriptor)) {
-          elements = GeneratorFactory.getDescriptorElements(inputStreamXML);
-        } catch (IOException e) {
-          e.printStackTrace();
+              GeneratorFactory.extractArtifactSerializerMappings(xmlDescriptorIn));
         }
       }
 
-      File[] resourceFiles = resourcePath.listFiles();
-
-      for (File resourceFile : resourceFiles) {
-        String resourceName = resourceFile.getName();
-        //gettting the serializer key from the element tag name
-        //if the element contains a dict attribute
-        for (Element xmlElement : elements) {
-          String dictName = xmlElement.getAttribute("dict");
-          if (dictName != null && dictName.equals(resourceName)) {
-            serializer = artifactSerializers.get(xmlElement.getTagName());
-          }
-        }
-        // TODO: Do different? For now just ignore ....
-        if (serializer == null)
-          continue;
-
-        try (InputStream resourceIn = CmdLineUtil.openInFile(resourceFile)) {
-          resources.put(resourceName, serializer.create(resourceIn));
-        } catch (IOException e) {
-          // TODO: Fix exception handling
-          e.printStackTrace();
+      for (Map.Entry<String, ArtifactSerializer> serializerMapping : artifactSerializers.entrySet()) {
+        String resourceName = serializerMapping.getKey();
+        try (InputStream resourceIn = CmdLineUtil.openInFile(new File(resourcePath, resourceName))) {
+          resources.put(resourceName, serializerMapping.getValue().create(resourceIn));
         }
       }
     }
     return resources;
   }
 
-  /**
-   * Calls a loadResources method above to load any external resource required for training.
-   * @param resourceDirectory the directory where the resources are to be found
-   * @param featureGeneratorDescriptor the xml feature generator
-   * @return a map containing the file name of the resource and its mapped Object
-   */
-  static Map<String, Object> loadResources(String resourceDirectory, File featureGeneratorDescriptor) {
-
-    if (resourceDirectory != null) {
-      File resourcePath = new File(resourceDirectory);
-
-      return loadResources(resourcePath, featureGeneratorDescriptor);
-    }
-
-    return new HashMap<>();
-  }
-
   public void run(String format, String[] args) {
     super.run(format, args);
 
@@ -174,12 +122,17 @@ public final class TokenNameFinderTrainerTool
 
     byte[] featureGeneratorBytes = openFeatureGeneratorBytes(params.getFeaturegen());
 
-
     // TODO: Support Custom resources:
     //       Must be loaded into memory, or written to tmp file until descriptor
     //       is loaded which defines parses when model is loaded
 
-    Map<String, Object> resources = loadResources(params.getResources(), params.getFeaturegen());
+    Map<String, Object> resources;
+    try {
+      resources = loadResources(params.getResources(), params.getFeaturegen());
+    }
+    catch (IOException e) {
+      throw new TerminateToolException(-1, e.getMessage(), e);
+    }
 
     CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
index 67ad2b9..c6a37a8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
@@ -77,8 +77,13 @@ public final class POSTaggerCrossValidatorTool
       }
     }
 
-    Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources(
-        params.getResources(), params.getFeaturegen());
+    Map<String, Object> resources;
+    try {
+      resources = TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+    }
+    catch (IOException e) {
+      throw new TerminateToolException(-1,"IO error while loading resources", e);
+    }
 
     byte[] featureGeneratorBytes =
         TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
index b922176..ca614f9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
@@ -67,8 +67,15 @@ public final class POSTaggerTrainerTool
     File modelOutFile = params.getModel();
     CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile);
 
-    Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources(
-        params.getResources(), params.getFeaturegen());
+    Map<String, Object> resources;
+
+    try {
+      resources = TokenNameFinderTrainerTool.loadResources(
+          params.getResources(), params.getFeaturegen());
+    }
+    catch (IOException e) {
+      throw new TerminateToolException(-1,"IO error while loading resources", e);
+    }
 
     byte[] featureGeneratorBytes =
         TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
index a1ac72b..5060961 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
@@ -755,7 +755,7 @@ public class GeneratorFactory {
     return createGenerator(generatorElement, resourceManager);
   }
 
-  public static Map<String, ArtifactSerializer<?>> extractCustomArtifactSerializerMappings(
+  public static Map<String, ArtifactSerializer<?>> extractArtifactSerializerMappings(
       InputStream xmlDescriptorIn) throws IOException {
 
     Map<String, ArtifactSerializer<?>> mapping = new HashMap<>();
@@ -764,7 +764,6 @@ public class GeneratorFactory {
 
     XPath xPath = XPathFactory.newInstance().newXPath();
 
-
     NodeList customElements;
     try {
       XPathExpression exp = xPath.compile("//custom");
@@ -774,7 +773,6 @@ public class GeneratorFactory {
     }
 
     for (int i = 0; i < customElements.getLength(); i++) {
-
       if (customElements.item(i) instanceof Element) {
         Element customElement = (Element) customElements.item(i);
 
@@ -788,6 +786,43 @@ public class GeneratorFactory {
         }
       }
     }
+
+    NodeList allElements;
+    try {
+      XPathExpression exp = xPath.compile("//*");
+      allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET);
+    } catch (XPathExpressionException e) {
+      throw new IllegalStateException("The hard coded XPath expression should always be valid!");
+    }
+
+    for (int i = 0; i < allElements.getLength(); i++) {
+      if (allElements.item(i) instanceof Element) {
+        Element xmlElement = (Element) allElements.item(i);
+
+        String dictName = xmlElement.getAttribute("dict");
+        if (dictName != null) {
+
+          switch (xmlElement.getTagName()) {
+            case "wordcluster":
+              mapping.put(dictName, new WordClusterDictionary.WordClusterDictionarySerializer());
+              break;
+
+            case "brownclustertoken":
+              mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+              break;
+
+            case "brownclustertokenclass"://, ;
+              mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+              break;
+
+            case "brownclusterbigram": //, ;
+              mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+              break;
+          }
+        }
+      }
+    }
+
     return mapping;
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
index 8a48575..dd569b0 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
@@ -120,7 +120,7 @@ public class GeneratorFactoryTest {
         "/opennlp/tools/util/featuregen/CustomClassLoadingWithSerializers.xml");
 
     Map<String, ArtifactSerializer<?>> mapping =
-        GeneratorFactory.extractCustomArtifactSerializerMappings(descIn);
+        GeneratorFactory.extractArtifactSerializerMappings(descIn);
 
     Assert.assertTrue(mapping.get("test.resource") instanceof WordClusterDictionarySerializer);
   }