You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ra...@apache.org on 2014/10/27 19:13:06 UTC

svn commit: r1634634 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/namefind/TokenNameFinderTrainerTool.java namefind/TokenNameFinderModel.java util/featuregen/GeneratorFactory.java

Author: ragerri
Date: Mon Oct 27 18:13:06 2014
New Revision: 1634634

URL: http://svn.apache.org/r1634634
Log:
OPENNLP-725 now the serializer is chosen from dict attribute and element tag

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1634634&r1=1634633&r2=1634634&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Mon Oct 27 18:13:06 2014
@@ -20,7 +20,9 @@ package opennlp.tools.cmdline.namefind;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 import opennlp.tools.cmdline.AbstractTrainerTool;
@@ -34,13 +36,14 @@ import opennlp.tools.namefind.NameSample
 import opennlp.tools.namefind.NameSampleTypeFilter;
 import opennlp.tools.namefind.TokenNameFinderFactory;
 import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.postag.POSTaggerFactory;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.SequenceCodec;
 import opennlp.tools.util.featuregen.GeneratorFactory;
 import opennlp.tools.util.model.ArtifactSerializer;
 import opennlp.tools.util.model.ModelUtil;
 
+import org.w3c.dom.Element;
+
 public final class TokenNameFinderTrainerTool
     extends AbstractTrainerTool<NameSample, TrainerToolParams> {
 
@@ -92,6 +95,8 @@ public final class TokenNameFinderTraine
 
       Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
           .createArtifactSerializers();
+      List<Element> elements = new ArrayList<Element>();
+      ArtifactSerializer serializer = null;
 
 
       // TODO: If there is descriptor file, it should be consulted too
@@ -105,38 +110,34 @@ public final class TokenNameFinderTraine
           // TODO: Improve error handling!
           e.printStackTrace();
         }
+        InputStream inputStreamXML = CmdLineUtil.openInFile(featureGenDescriptor);
+        try {
+          elements = GeneratorFactory.getDescriptorElements(inputStreamXML);
+        } catch (IOException e) {
+          e.printStackTrace();
+        }
       }
 
       File resourceFiles[] = resourcePath.listFiles();
-
-      // TODO: Filter files, also files with start with a dot
+      
       for (File resourceFile : resourceFiles) {
-
-        // TODO: Move extension extracting code to method and
-        // write unit test for it
-
-        // extract file ending
         String resourceName = resourceFile.getName();
-
-        int lastDot = resourceName.lastIndexOf('.');
-
-        if (lastDot == -1) {
-          continue;
+        //getting the serializer key from the element tag name
+        //if the element contains a dict attribute
+        for (Element xmlElement : elements) {
+          String dictName = xmlElement.getAttribute("dict");
+          if (dictName != null && dictName.equals(resourceName)) {
+            serializer = artifactSerializers.get(xmlElement.getTagName());
+          }
         }
-
-        String ending = resourceName.substring(lastDot + 1);
-
-        // lookup serializer from map
-        ArtifactSerializer serializer = artifactSerializers.get(ending);
-
         // TODO: Do different? For now just ignore ....
         if (serializer == null)
           continue;
 
-        InputStream resoruceIn = CmdLineUtil.openInFile(resourceFile);
+        InputStream resourceIn = CmdLineUtil.openInFile(resourceFile);
 
         try {
-          resources.put(resourceName, serializer.create(resoruceIn));
+          resources.put(resourceName, serializer.create(resourceIn));
         } catch (InvalidFormatException e) {
           // TODO: Fix exception handling
           e.printStackTrace();
@@ -145,7 +146,7 @@ public final class TokenNameFinderTraine
           e.printStackTrace();
         } finally {
           try {
-            resoruceIn.close();
+            resourceIn.close();
           } catch (IOException e) {
           }
         }

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java?rev=1634634&r1=1634633&r2=1634634&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java Mon Oct 27 18:13:06 2014
@@ -263,7 +263,7 @@ public class TokenNameFinderModel extend
     Map<String, ArtifactSerializer> serializers = BaseModel.createArtifactSerializers();
 
     serializers.put("featuregen", new ByteArraySerializer());
-    serializers.put("w2vclasses", new W2VClassesDictionary.W2VClassesDictionarySerializer());
+    serializers.put("w2vwordcluster", new W2VClassesDictionary.W2VClassesDictionarySerializer());
 
     return serializers;
   }

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java?rev=1634634&r1=1634633&r2=1634634&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java Mon Oct 27 18:13:06 2014
@@ -19,10 +19,12 @@ package opennlp.tools.util.featuregen;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;
 
 import javax.xml.namespace.QName;
@@ -667,4 +669,28 @@ public class GeneratorFactory {
     }
     return mapping;
   }
+  
+  /** Parses the XML descriptor read from the stream and returns all elements it contains. */
+  public static List<Element> getDescriptorElements(InputStream xmlDescriptorIn)
+      throws IOException, InvalidFormatException {
+    org.w3c.dom.Document xmlDescriptorDOM = createDOM(xmlDescriptorIn);
+    XPath xPath = XPathFactory.newInstance().newXPath();
+    NodeList allElements;
+    try {
+      // "//*" matches every element node in the document, the root element included.
+      XPathExpression exp = xPath.compile("//*");
+      allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET);
+    } catch (XPathExpressionException e) {
+      // Chain the cause so an unexpected XPath failure stays diagnosable.
+      throw new IllegalStateException("The hard coded XPath expression should always be valid!", e);
+    }
+
+    List<Element> elements = new ArrayList<Element>(allElements.getLength());
+    for (int i = 0; i < allElements.getLength(); i++) {
+      if (allElements.item(i) instanceof Element) {
+        elements.add((Element) allElements.item(i));
+      }
+    }
+    return elements;
+  }
 }