You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/04/20 16:07:07 UTC
opennlp git commit: OPENNLP-1034: Move serializers to resource
mapping to GeneratorFactory
Repository: opennlp
Updated Branches:
refs/heads/master 041507d3a -> f74a86f4b
OPENNLP-1034: Move serializers to resource mapping to GeneratorFactory
Closes #173
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f74a86f4
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f74a86f4
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f74a86f4
Branch: refs/heads/master
Commit: f74a86f4b6a6f93d3a1e10f2a4852c5898feefb3
Parents: 041507d
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed Apr 19 18:34:15 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Apr 20 18:05:40 2017 +0200
----------------------------------------------------------------------
.../TokenNameFinderCrossValidatorTool.java | 10 ++-
.../namefind/TokenNameFinderTrainerTool.java | 77 ++++----------------
.../postag/POSTaggerCrossValidatorTool.java | 9 ++-
.../cmdline/postag/POSTaggerTrainerTool.java | 11 ++-
.../tools/util/featuregen/GeneratorFactory.java | 41 ++++++++++-
.../util/featuregen/GeneratorFactoryTest.java | 2 +-
6 files changed, 78 insertions(+), 72 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
index 0ee3738..6e62577 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
@@ -71,8 +71,14 @@ public final class TokenNameFinderCrossValidatorTool
byte[] featureGeneratorBytes =
TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());
- Map<String, Object> resources =
- TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+ Map<String, Object> resources;
+
+ try {
+ resources = TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+ }
+ catch (IOException e) {
+ throw new TerminateToolException(-1,"IO error while loading resources", e);
+ }
if (params.getNameTypes() != null) {
String[] nameTypes = params.getNameTypes().split(",");
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
index 4fb8cb9..f3cef48 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
@@ -20,13 +20,9 @@ package opennlp.tools.cmdline.namefind;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
import java.util.HashMap;
-import java.util.List;
import java.util.Map;
-import org.w3c.dom.Element;
-
import opennlp.tools.cmdline.AbstractTrainerTool;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.TerminateToolException;
@@ -89,79 +85,31 @@ public final class TokenNameFinderTrainerTool
* @param featureGenDescriptor the feature xml descriptor
* @return a map consisting of the file name of the resource and its corresponding Object
*/
- public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor) {
+ public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor)
+ throws IOException {
Map<String, Object> resources = new HashMap<>();
if (resourcePath != null) {
+ Map<String, ArtifactSerializer> artifactSerializers = new HashMap<>();
- Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
- .createArtifactSerializers();
- List<Element> elements = new ArrayList<>();
- ArtifactSerializer serializer = null;
-
-
- // TODO: If there is descriptor file, it should be consulted too
if (featureGenDescriptor != null) {
try (InputStream xmlDescriptorIn = CmdLineUtil.openInFile(featureGenDescriptor)) {
artifactSerializers.putAll(
- GeneratorFactory.extractCustomArtifactSerializerMappings(xmlDescriptorIn));
- } catch (IOException e) {
- // TODO: Improve error handling!
- e.printStackTrace();
- }
-
- try (InputStream inputStreamXML = CmdLineUtil.openInFile(featureGenDescriptor)) {
- elements = GeneratorFactory.getDescriptorElements(inputStreamXML);
- } catch (IOException e) {
- e.printStackTrace();
+ GeneratorFactory.extractArtifactSerializerMappings(xmlDescriptorIn));
}
}
- File[] resourceFiles = resourcePath.listFiles();
-
- for (File resourceFile : resourceFiles) {
- String resourceName = resourceFile.getName();
- //gettting the serializer key from the element tag name
- //if the element contains a dict attribute
- for (Element xmlElement : elements) {
- String dictName = xmlElement.getAttribute("dict");
- if (dictName != null && dictName.equals(resourceName)) {
- serializer = artifactSerializers.get(xmlElement.getTagName());
- }
- }
- // TODO: Do different? For now just ignore ....
- if (serializer == null)
- continue;
-
- try (InputStream resourceIn = CmdLineUtil.openInFile(resourceFile)) {
- resources.put(resourceName, serializer.create(resourceIn));
- } catch (IOException e) {
- // TODO: Fix exception handling
- e.printStackTrace();
+ for (Map.Entry<String, ArtifactSerializer> serializerMapping : artifactSerializers.entrySet()) {
+ String resourceName = serializerMapping.getKey();
+ try (InputStream resourceIn = CmdLineUtil.openInFile(new File(resourcePath, resourceName))) {
+ resources.put(resourceName, serializerMapping.getValue().create(resourceIn));
}
}
}
return resources;
}
- /**
- * Calls a loadResources method above to load any external resource required for training.
- * @param resourceDirectory the directory where the resources are to be found
- * @param featureGeneratorDescriptor the xml feature generator
- * @return a map containing the file name of the resource and its mapped Object
- */
- static Map<String, Object> loadResources(String resourceDirectory, File featureGeneratorDescriptor) {
-
- if (resourceDirectory != null) {
- File resourcePath = new File(resourceDirectory);
-
- return loadResources(resourcePath, featureGeneratorDescriptor);
- }
-
- return new HashMap<>();
- }
-
public void run(String format, String[] args) {
super.run(format, args);
@@ -174,12 +122,17 @@ public final class TokenNameFinderTrainerTool
byte[] featureGeneratorBytes = openFeatureGeneratorBytes(params.getFeaturegen());
-
// TODO: Support Custom resources:
// Must be loaded into memory, or written to tmp file until descriptor
// is loaded which defines parses when model is loaded
- Map<String, Object> resources = loadResources(params.getResources(), params.getFeaturegen());
+ Map<String, Object> resources;
+ try {
+ resources = loadResources(params.getResources(), params.getFeaturegen());
+ }
+ catch (IOException e) {
+ throw new TerminateToolException(-1, e.getMessage(), e);
+ }
CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
index 67ad2b9..c6a37a8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
@@ -77,8 +77,13 @@ public final class POSTaggerCrossValidatorTool
}
}
- Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources(
- params.getResources(), params.getFeaturegen());
+ Map<String, Object> resources;
+ try {
+ resources = TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+ }
+ catch (IOException e) {
+ throw new TerminateToolException(-1,"IO error while loading resources", e);
+ }
byte[] featureGeneratorBytes =
TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
index b922176..ca614f9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
@@ -67,8 +67,15 @@ public final class POSTaggerTrainerTool
File modelOutFile = params.getModel();
CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile);
- Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources(
- params.getResources(), params.getFeaturegen());
+ Map<String, Object> resources;
+
+ try {
+ resources = TokenNameFinderTrainerTool.loadResources(
+ params.getResources(), params.getFeaturegen());
+ }
+ catch (IOException e) {
+ throw new TerminateToolException(-1,"IO error while loading resources", e);
+ }
byte[] featureGeneratorBytes =
TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
index a1ac72b..5060961 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
@@ -755,7 +755,7 @@ public class GeneratorFactory {
return createGenerator(generatorElement, resourceManager);
}
- public static Map<String, ArtifactSerializer<?>> extractCustomArtifactSerializerMappings(
+ public static Map<String, ArtifactSerializer<?>> extractArtifactSerializerMappings(
InputStream xmlDescriptorIn) throws IOException {
Map<String, ArtifactSerializer<?>> mapping = new HashMap<>();
@@ -764,7 +764,6 @@ public class GeneratorFactory {
XPath xPath = XPathFactory.newInstance().newXPath();
-
NodeList customElements;
try {
XPathExpression exp = xPath.compile("//custom");
@@ -774,7 +773,6 @@ public class GeneratorFactory {
}
for (int i = 0; i < customElements.getLength(); i++) {
-
if (customElements.item(i) instanceof Element) {
Element customElement = (Element) customElements.item(i);
@@ -788,6 +786,43 @@ public class GeneratorFactory {
}
}
}
+
+ NodeList allElements;
+ try {
+ XPathExpression exp = xPath.compile("//*");
+ allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET);
+ } catch (XPathExpressionException e) {
+ throw new IllegalStateException("The hard coded XPath expression should always be valid!");
+ }
+
+ for (int i = 0; i < allElements.getLength(); i++) {
+ if (allElements.item(i) instanceof Element) {
+ Element xmlElement = (Element) allElements.item(i);
+
+ String dictName = xmlElement.getAttribute("dict");
+ if (dictName != null) {
+
+ switch (xmlElement.getTagName()) {
+ case "wordcluster":
+ mapping.put(dictName, new WordClusterDictionary.WordClusterDictionarySerializer());
+ break;
+
+ case "brownclustertoken":
+ mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+ break;
+
+ case "brownclustertokenclass"://, ;
+ mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+ break;
+
+ case "brownclusterbigram": //, ;
+ mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+ break;
+ }
+ }
+ }
+ }
+
return mapping;
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
index 8a48575..dd569b0 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
@@ -120,7 +120,7 @@ public class GeneratorFactoryTest {
"/opennlp/tools/util/featuregen/CustomClassLoadingWithSerializers.xml");
Map<String, ArtifactSerializer<?>> mapping =
- GeneratorFactory.extractCustomArtifactSerializerMappings(descIn);
+ GeneratorFactory.extractArtifactSerializerMappings(descIn);
Assert.assertTrue(mapping.get("test.resource") instanceof WordClusterDictionarySerializer);
}