You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/25 10:59:07 UTC

svn commit: r1127439 - /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java

Author: joern
Date: Wed May 25 08:59:06 2011
New Revision: 1127439

URL: http://svn.apache.org/viewvc?rev=1127439&view=rev
Log:
OPENNLP-17 Initial version of feature generator factory.

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java   (with props)

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java?rev=1127439&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java Wed May 25 08:59:06 2011
@@ -0,0 +1,494 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
+
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+/**
+ * Creates a set of feature generators based on a provided XML descriptor.
+ *
+ * Example of an XML descriptor:
+ *
+ * <generators>
+ *   <charngram min = "2" max = "5"/>
+ *   <definition/>
+ *   <cache>
+ *     <window prevLength = "3" nextLength = "3">
+ *       <generators>
+ *         <prevmap/>
+ *         <sentence/>
+ *         <tokenclass/>
+ *         <tokenpattern/>
+ *       </generators>
+ *     </window>
+ *   </cache>
+ * </generators>
+ *
+ * Each XML element is mapped to a {@link XmlFeatureGeneratorFactory} which
+ * is responsible for processing the element and creating the specified
+ * {@link AdaptiveFeatureGenerator}. Elements can contain other elements;
+ * in this case it is the responsibility of the mapped factory to process
+ * the child elements correctly. In some factories this leads to recursive
+ * calls to the {@link #createGenerator(Element, FeatureGeneratorResourceProvider)} method.
+ *
+ * In the example above the generators element is mapped to the
+ * {@link AggregatedFeatureGeneratorFactory}, which then
+ * creates all the aggregated {@link AdaptiveFeatureGenerator}s. To
+ * accomplish this it evaluates the mapping with the same mechanism
+ * and gives the child elements to the corresponding factories. All
+ * created generators are added to a new instance of the
+ * {@link AggregatedFeatureGenerator} which is then returned.
+ */
+public class GeneratorFactory {
+
+  /**
+   * An {@link XmlFeatureGeneratorFactory} is responsible for constructing
+   * an {@link AdaptiveFeatureGenerator} from a given XML {@link Element},
+   * which contains all necessary configuration, if any.
+   */
+  static interface XmlFeatureGeneratorFactory {
+
+    /**
+     * Creates an {@link AdaptiveFeatureGenerator} from the XML element
+     * which describes it.
+     *
+     * @param generatorElement the element which contains the configuration
+     * @param resourceManager the resource manager which could be used
+     *     to access referenced resources
+     *
+     * @return the configured {@link AdaptiveFeatureGenerator}
+     *
+     * @throws InvalidFormatException if the element does not contain a
+     *     valid configuration for the generator
+     */
+    AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException;
+  }
+
+  /**
+   * @see AggregatedFeatureGenerator
+   */
+  static class AggregatedFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager)  throws InvalidFormatException {
+
+      Collection<AdaptiveFeatureGenerator> aggregatedGenerators =
+          new LinkedList<AdaptiveFeatureGenerator>();
+
+      NodeList childNodes = generatorElement.getChildNodes();
+
+      for (int i = 0; i < childNodes.getLength(); i++) {
+        Node childNode = childNodes.item(i);
+
+        if (childNode instanceof Element) {
+          Element aggregatedGeneratorElement = (Element) childNode;
+
+          aggregatedGenerators.add(
+              GeneratorFactory.createGenerator(aggregatedGeneratorElement, resourceManager));
+        }
+      }
+
+      return new AggregatedFeatureGenerator(aggregatedGenerators.toArray(
+              new AdaptiveFeatureGenerator[aggregatedGenerators.size()]));
+    }
+
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("generators", new AggregatedFeatureGeneratorFactory());
+    }
+  }
+
+  /**
+   * Factory for the <code>cache</code> element. The first child element
+   * describes the generator which is wrapped by a {@link CachedFeatureGenerator}.
+   *
+   * @see CachedFeatureGenerator
+   */
+  static class CachedFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    private CachedFeatureGeneratorFactory() {
+    }
+
+    /**
+     * Creates the generator described by the first child element and wraps
+     * it into a {@link CachedFeatureGenerator}.
+     *
+     * @param generatorElement the <code>cache</code> element
+     * @param resourceManager passed on unchanged to the nested factory
+     *
+     * @return the {@link CachedFeatureGenerator} wrapping the nested generator
+     *
+     * @throws InvalidFormatException if there is no child element or the
+     *     child element cannot be turned into a generator
+     */
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+      NodeList children = generatorElement.getChildNodes();
+
+      // Only the first child element is used; any further children are ignored.
+      Element nestedElement = null;
+
+      for (int index = 0; nestedElement == null && index < children.getLength(); index++) {
+        Node child = children.item(index);
+
+        if (child instanceof Element) {
+          nestedElement = (Element) child;
+        }
+      }
+
+      if (nestedElement == null) {
+        throw new InvalidFormatException("Could not find containing generator element!");
+      }
+
+      AdaptiveFeatureGenerator cachedGenerator =
+          GeneratorFactory.createGenerator(nestedElement, resourceManager);
+
+      return new CachedFeatureGenerator(cachedGenerator);
+    }
+
+    /**
+     * Registers an instance of this factory under the <code>cache</code>
+     * element name.
+     *
+     * @param factoryMap the element name to factory mapping to extend
+     */
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("cache", new CachedFeatureGeneratorFactory());
+    }
+  }
+
+  /**
+   * Factory for the <code>charngram</code> element. The element must carry the
+   * integer attributes <code>min</code> and <code>max</code>, which are passed
+   * to the {@link CharacterNgramFeatureGenerator} constructor.
+   *
+   * @see CharacterNgramFeatureGenerator
+   */
+  static class CharacterNgramFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    /**
+     * Creates a {@link CharacterNgramFeatureGenerator} from the
+     * <code>min</code> and <code>max</code> attributes of the element.
+     *
+     * @param generatorElement the <code>charngram</code> element
+     * @param resourceManager not used by this factory
+     *
+     * @return the configured {@link CharacterNgramFeatureGenerator}
+     *
+     * @throws InvalidFormatException if the <code>min</code> or the
+     *     <code>max</code> attribute cannot be parsed as an integer
+     */
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+      int min = parseIntAttribute(generatorElement, "min");
+      int max = parseIntAttribute(generatorElement, "max");
+
+      return new CharacterNgramFeatureGenerator(min, max);
+    }
+
+    /**
+     * Reads the named attribute from the element and parses it as an integer.
+     *
+     * @param element the element which carries the attribute
+     * @param attributeName the name of the attribute to read
+     *
+     * @return the parsed attribute value
+     *
+     * @throws InvalidFormatException if the attribute value is not an integer
+     */
+    private static int parseIntAttribute(Element element, String attributeName)
+        throws InvalidFormatException {
+
+      try {
+        return Integer.parseInt(element.getAttribute(attributeName));
+      } catch (NumberFormatException e) {
+        // Chain the cause so the offending attribute value stays visible to the caller.
+        throw new InvalidFormatException(attributeName + " attribute is not a number!", e);
+      }
+    }
+
+    /**
+     * Registers an instance of this factory under the <code>charngram</code>
+     * element name.
+     *
+     * @param factoryMap the element name to factory mapping to extend
+     */
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("charngram", new CharacterNgramFeatureGeneratorFactory());
+    }
+  }
+
+  /**
+   * @see DefinitionFeatureGenerator
+   */
+  static class DefinitionFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    private static final String ELEMENT_NAME = "definition";
+
+    private DefinitionFeatureGeneratorFactory() {
+    }
+
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+      return new OutcomePriorFeatureGenerator();
+    }
+
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put(ELEMENT_NAME, new DefinitionFeatureGeneratorFactory());
+    }
+  }
+
+  /**
+   * Factory for the <code>dictionary</code> element. The <code>dict</code>
+   * attribute is the key of a {@link Dictionary} resource and the
+   * <code>prefix</code> attribute is passed on to the
+   * {@link DictionaryFeatureGenerator}.
+   *
+   * @see DictionaryFeatureGenerator
+   */
+  static class DictionaryFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    /**
+     * Looks up the dictionary referenced by the <code>dict</code> attribute
+     * and creates a {@link DictionaryFeatureGenerator} for it.
+     *
+     * @param generatorElement the <code>dictionary</code> element
+     * @param resourceManager used to resolve the dictionary resource
+     *
+     * @return the configured {@link DictionaryFeatureGenerator}
+     *
+     * @throws InvalidFormatException if the resource found for the key is
+     *     not a {@link Dictionary}
+     */
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+      String dictionaryKey = generatorElement.getAttribute("dict");
+      Object resource = resourceManager.getResource(dictionaryKey);
+
+      if (resource instanceof Dictionary) {
+        Dictionary dictionary = (Dictionary) resource;
+
+        return new DictionaryFeatureGenerator(generatorElement.getAttribute("prefix"), dictionary);
+      }
+
+      // Also reached when the lookup yields null, since null is not an instance of Dictionary.
+      throw new InvalidFormatException("No dictionary resource for key: " + dictionaryKey);
+    }
+
+    /**
+     * Registers an instance of this factory under the <code>dictionary</code>
+     * element name.
+     *
+     * @param factoryMap the element name to factory mapping to extend
+     */
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("dictionary", new DictionaryFeatureGeneratorFactory());
+    }
+  }
+
+  /**
+   * @see PreviousMapFeatureGenerator
+   */
+  static class PreviousMapFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) {
+      return new PreviousMapFeatureGenerator();
+    }
+
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("prevmap", new PreviousMapFeatureGeneratorFactory());
+    }
+  }
+
+  // TODO: Add parameters ... 
+  
+  /**
+   * Factory for the <code>sentence</code> element. The optional boolean
+   * attributes <code>begin</code> and <code>end</code> are passed to the
+   * {@link SentenceFeatureGenerator} constructor.
+   *
+   * @see SentenceFeatureGenerator
+   */
+  static class SentenceFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    /**
+     * Creates a {@link SentenceFeatureGenerator} from the <code>begin</code>
+     * and <code>end</code> attributes. An attribute with an empty value
+     * counts as <code>true</code>; any other value is interpreted by
+     * {@link Boolean#parseBoolean(String)}.
+     *
+     * @param generatorElement the <code>sentence</code> element
+     * @param resourceManager not used by this factory
+     *
+     * @return the configured {@link SentenceFeatureGenerator}
+     */
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) {
+
+      String beginValue = generatorElement.getAttribute("begin");
+      String endValue = generatorElement.getAttribute("end");
+
+      // An empty attribute value falls back to the default, which is true.
+      boolean beginFeature = beginValue.isEmpty() || Boolean.parseBoolean(beginValue);
+      boolean endFeature = endValue.isEmpty() || Boolean.parseBoolean(endValue);
+
+      return new SentenceFeatureGenerator(beginFeature, endFeature);
+    }
+
+    /**
+     * Registers an instance of this factory under the <code>sentence</code>
+     * element name.
+     *
+     * @param factoryMap the element name to factory mapping to extend
+     */
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("sentence", new SentenceFeatureGeneratorFactory());
+    }
+  }
+
+  /**
+   * @see TokenClassFeatureGenerator
+   */
+  static class TokenClassFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) {
+      // TODO: Make it configurable ...
+      return new TokenClassFeatureGenerator(true);
+    }
+
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("tokenclass", new TokenClassFeatureGeneratorFactory());
+    }
+  }
+
+  /**
+   * Factory for the <code>token</code> element.
+   *
+   * @see TokenFeatureGenerator
+   */
+  static class TokenFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    /**
+     * Creates a new {@link TokenFeatureGenerator}. Neither the element
+     * nor the resource manager is inspected.
+     *
+     * @param generatorElement the <code>token</code> element
+     * @param resourceManager not used by this factory
+     *
+     * @return a new {@link TokenFeatureGenerator}
+     */
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) {
+
+      return new TokenFeatureGenerator();
+    }
+
+    /**
+     * Registers an instance of this factory under the <code>token</code>
+     * element name.
+     *
+     * @param factoryMap the element name to factory mapping to extend
+     */
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      // The token element must map to this factory; the token pattern factory
+      // has its own element name and would create a different generator.
+      factoryMap.put("token", new TokenFeatureGeneratorFactory());
+    }
+  }
+  
+  static class BigramNameFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+    
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) {
+      
+      return new BigramNameFeatureGenerator();
+    }
+    
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("bigram", new BigramNameFeatureGeneratorFactory());
+    }
+  }
+  
+  /**
+   * @see TokenPatternFeatureGenerator
+   */
+  static class TokenPatternFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager) {
+      return new TokenPatternFeatureGenerator();
+    }
+
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("tokenpattern", new TokenPatternFeatureGeneratorFactory());
+    }
+  }
+
+  /**
+   * Factory for the <code>window</code> element. The first child element
+   * describes the nested generator; the integer attributes
+   * <code>prevLength</code> and <code>nextLength</code> are passed to the
+   * {@link WindowFeatureGenerator} constructor together with it.
+   *
+   * @see WindowFeatureGenerator
+   */
+  static class WindowFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    /**
+     * Creates a {@link WindowFeatureGenerator} around the generator
+     * described by the first child element.
+     *
+     * @param generatorElement the <code>window</code> element
+     * @param resourceManager passed on unchanged to the nested factory
+     *
+     * @return the configured {@link WindowFeatureGenerator}
+     *
+     * @throws InvalidFormatException if there is no child element, the child
+     *     element cannot be turned into a generator, or the
+     *     <code>prevLength</code> or <code>nextLength</code> attribute cannot
+     *     be parsed as an integer
+     */
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+        FeatureGeneratorResourceProvider resourceManager)  throws InvalidFormatException {
+
+      Element nestedGeneratorElement = null;
+
+      NodeList kids = generatorElement.getChildNodes();
+
+      // Only the first child element is used; any further children are ignored.
+      for (int i = 0; i < kids.getLength(); i++) {
+        Node childNode = kids.item(i);
+
+        if (childNode instanceof Element) {
+          nestedGeneratorElement = (Element) childNode;
+          break;
+        }
+      }
+
+      if (nestedGeneratorElement == null) {
+        throw new InvalidFormatException(
+            "window feature generator must contain an aggregator element");
+      }
+
+      // The nested generator is created before the attributes are parsed, so
+      // errors in the nested descriptor are reported first.
+      AdaptiveFeatureGenerator nestedGenerator =
+          GeneratorFactory.createGenerator(nestedGeneratorElement, resourceManager);
+
+      int prevLength = parseIntAttribute(generatorElement, "prevLength");
+      int nextLength = parseIntAttribute(generatorElement, "nextLength");
+
+      return new WindowFeatureGenerator(nestedGenerator, prevLength, nextLength);
+    }
+
+    /**
+     * Reads the named attribute from the element and parses it as an integer.
+     *
+     * @param element the element which carries the attribute
+     * @param attributeName the name of the attribute to read
+     *
+     * @return the parsed attribute value
+     *
+     * @throws InvalidFormatException if the attribute value is not an integer
+     */
+    private static int parseIntAttribute(Element element, String attributeName)
+        throws InvalidFormatException {
+
+      try {
+        return Integer.parseInt(element.getAttribute(attributeName));
+      } catch (NumberFormatException e) {
+        // Chain the cause so the offending attribute value stays visible to the caller.
+        throw new InvalidFormatException(attributeName + " attribute is not a number!", e);
+      }
+    }
+
+    /**
+     * Registers an instance of this factory under the <code>window</code>
+     * element name.
+     *
+     * @param factoryMap the element name to factory mapping to extend
+     */
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("window", new WindowFeatureGeneratorFactory());
+    }
+  }
+
+  private static Map<String, XmlFeatureGeneratorFactory> factories =
+      new HashMap<String, XmlFeatureGeneratorFactory>();
+
+  static {
+    AggregatedFeatureGeneratorFactory.register(factories);
+    CachedFeatureGeneratorFactory.register(factories);
+    CharacterNgramFeatureGeneratorFactory.register(factories);
+    DefinitionFeatureGeneratorFactory.register(factories);
+    DictionaryFeatureGeneratorFactory.register(factories);
+    PreviousMapFeatureGeneratorFactory.register(factories);
+    SentenceFeatureGeneratorFactory.register(factories);
+    TokenClassFeatureGeneratorFactory.register(factories);
+    TokenFeatureGeneratorFactory.register(factories);
+    BigramNameFeatureGeneratorFactory.register(factories);
+    TokenPatternFeatureGeneratorFactory.register(factories);
+    WindowFeatureGeneratorFactory.register(factories);
+  }
+
+  /**
+   * Creates an {@link AdaptiveFeatureGenerator} for the provided element.
+   * The factory is looked up by the tag name of the element and is then
+   * responsible for creating the generator from the element.
+   *
+   * @param generatorElement the element which describes the generator
+   * @param resourceManager the resource manager which is passed on to the
+   *     factory
+   *
+   * @return the generator created by the factory registered for the element
+   *
+   * @throws InvalidFormatException if no factory is registered for the tag
+   *     name of the element, or if the factory rejects the element
+   */
+  static AdaptiveFeatureGenerator createGenerator(Element generatorElement,
+      FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+    String tagName = generatorElement.getTagName();
+    XmlFeatureGeneratorFactory factory = factories.get(tagName);
+
+    if (factory != null) {
+      return factory.create(generatorElement, resourceManager);
+    }
+
+    throw new InvalidFormatException("Unexpected element: " + tagName);
+  }
+
+  /**
+   * Creates an {@link AdaptiveFeatureGenerator} from a provided XML descriptor.
+   *
+   * Usually this XML descriptor contains a set of nested feature generators
+   * which are then used to generate the features by one of the opennlp
+   * components.
+   *
+   * @param xmlDescriptorIn the {@link InputStream} from which the descriptor
+   * is read, the stream remains open and must be closed by the caller.
+   *
+   * @param resourceManager the resource manager which is used to resolve resources
+   * referenced by a key in the descriptor
+   *
+   * @return the generator created for the document element of the descriptor
+   *
+   * @throws IOException if an error occurs during reading from the descriptor
+   *     {@link InputStream}
+   * @throws InvalidFormatException if the descriptor is not valid XML or
+   *     contains an element which cannot be turned into a generator
+   * @throws IllegalStateException if no XML {@link DocumentBuilder} can be
+   *     created in this runtime
+   */
+  public static AdaptiveFeatureGenerator create(InputStream xmlDescriptorIn,
+      FeatureGeneratorResourceProvider resourceManager) throws IOException, InvalidFormatException {
+
+    // NOTE(review): the parser runs with its default configuration; if
+    // descriptors can come from untrusted sources, consider disabling DTDs
+    // and external entities on the factory.
+    DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
+
+    DocumentBuilder documentBuilder;
+
+    try {
+      documentBuilder = documentBuilderFactory.newDocumentBuilder();
+    } catch (ParserConfigurationException e) {
+      // Without a builder nothing can be parsed; fail fast with the cause
+      // attached instead of continuing with a null builder.
+      throw new IllegalStateException("Unable to create an XML document builder!", e);
+    }
+
+    org.w3c.dom.Document xmlDescriptorDOM;
+
+    try {
+      xmlDescriptorDOM = documentBuilder.parse(xmlDescriptorIn);
+    } catch (SAXException e) {
+      throw new InvalidFormatException("Descriptor is not valid XML!", e);
+    }
+
+    Element generatorElement = xmlDescriptorDOM.getDocumentElement();
+
+    return createGenerator(generatorElement, resourceManager);
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain