You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ra...@apache.org on 2014/10/27 19:13:06 UTC
svn commit: r1634634 - in
/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools:
cmdline/namefind/TokenNameFinderTrainerTool.java
namefind/TokenNameFinderModel.java util/featuregen/GeneratorFactory.java
Author: ragerri
Date: Mon Oct 27 18:13:06 2014
New Revision: 1634634
URL: http://svn.apache.org/r1634634
Log:
OPENNLP-725: the serializer is now chosen from the dict attribute and the element tag name
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1634634&r1=1634633&r2=1634634&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Mon Oct 27 18:13:06 2014
@@ -20,7 +20,9 @@ package opennlp.tools.cmdline.namefind;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import opennlp.tools.cmdline.AbstractTrainerTool;
@@ -34,13 +36,14 @@ import opennlp.tools.namefind.NameSample
import opennlp.tools.namefind.NameSampleTypeFilter;
import opennlp.tools.namefind.TokenNameFinderFactory;
import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.postag.POSTaggerFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.SequenceCodec;
import opennlp.tools.util.featuregen.GeneratorFactory;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.ModelUtil;
+import org.w3c.dom.Element;
+
public final class TokenNameFinderTrainerTool
extends AbstractTrainerTool<NameSample, TrainerToolParams> {
@@ -92,6 +95,8 @@ public final class TokenNameFinderTraine
Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
.createArtifactSerializers();
+ List<Element> elements = new ArrayList<Element>();
+ ArtifactSerializer serializer = null;
// TODO: If there is descriptor file, it should be consulted too
@@ -105,38 +110,34 @@ public final class TokenNameFinderTraine
// TODO: Improve error handling!
e.printStackTrace();
}
+ InputStream inputStreamXML = CmdLineUtil.openInFile(featureGenDescriptor);
+ try {
+ elements = GeneratorFactory.getDescriptorElements(inputStreamXML);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
}
File resourceFiles[] = resourcePath.listFiles();
-
- // TODO: Filter files, also files with start with a dot
+
for (File resourceFile : resourceFiles) {
-
- // TODO: Move extension extracting code to method and
- // write unit test for it
-
- // extract file ending
String resourceName = resourceFile.getName();
-
- int lastDot = resourceName.lastIndexOf('.');
-
- if (lastDot == -1) {
- continue;
+ //getting the serializer key from the element tag name
+ //if the element contains a dict attribute
+ for (Element xmlElement : elements) {
+ String dictName = xmlElement.getAttribute("dict");
+ if (dictName != null && dictName.equals(resourceName)) {
+ serializer = artifactSerializers.get(xmlElement.getTagName());
+ }
}
-
- String ending = resourceName.substring(lastDot + 1);
-
- // lookup serializer from map
- ArtifactSerializer serializer = artifactSerializers.get(ending);
-
// TODO: Do different? For now just ignore ....
if (serializer == null)
continue;
- InputStream resoruceIn = CmdLineUtil.openInFile(resourceFile);
+ InputStream resourceIn = CmdLineUtil.openInFile(resourceFile);
try {
- resources.put(resourceName, serializer.create(resoruceIn));
+ resources.put(resourceName, serializer.create(resourceIn));
} catch (InvalidFormatException e) {
// TODO: Fix exception handling
e.printStackTrace();
@@ -145,7 +146,7 @@ public final class TokenNameFinderTraine
e.printStackTrace();
} finally {
try {
- resoruceIn.close();
+ resourceIn.close();
} catch (IOException e) {
}
}
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java?rev=1634634&r1=1634633&r2=1634634&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java Mon Oct 27 18:13:06 2014
@@ -263,7 +263,7 @@ public class TokenNameFinderModel extend
Map<String, ArtifactSerializer> serializers = BaseModel.createArtifactSerializers();
serializers.put("featuregen", new ByteArraySerializer());
- serializers.put("w2vclasses", new W2VClassesDictionary.W2VClassesDictionarySerializer());
+ serializers.put("w2vwordcluster", new W2VClassesDictionary.W2VClassesDictionarySerializer());
return serializers;
}
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java?rev=1634634&r1=1634633&r2=1634634&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java Mon Oct 27 18:13:06 2014
@@ -19,10 +19,12 @@ package opennlp.tools.util.featuregen;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
+import java.util.List;
import java.util.Map;
import javax.xml.namespace.QName;
@@ -667,4 +669,28 @@ public class GeneratorFactory {
}
return mapping;
}
+
+ public static List<Element> getDescriptorElements(
+ InputStream xmlDescriptorIn)
+ throws IOException, InvalidFormatException {
+
+ List<Element> elements = new ArrayList<Element>();
+ org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn);
+ XPath xPath = XPathFactory.newInstance().newXPath();
+ NodeList allElements;
+ try {
+ XPathExpression exp = xPath.compile("//*");
+ allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET);
+ } catch (XPathExpressionException e) {
+ throw new IllegalStateException("The hard coded XPath expression should always be valid!");
+ }
+
+ for (int i = 0; i < allElements.getLength(); i++) {
+ if (allElements.item(i) instanceof Element) {
+ Element customElement = (Element) allElements.item(i);
+ elements.add(customElement);
+ }
+ }
+ return elements;
+ }
}