You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/25 10:59:07 UTC
svn commit: r1127439 -
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
Author: joern
Date: Wed May 25 08:59:06 2011
New Revision: 1127439
URL: http://svn.apache.org/viewvc?rev=1127439&view=rev
Log:
OPENNLP-17 Initial version of feature generator factory.
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java (with props)
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java?rev=1127439&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java Wed May 25 08:59:06 2011
@@ -0,0 +1,494 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
+
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+/**
+ * Creates a set of feature generators based on a provided XML descriptor.
+ *
+ * Example of an XML descriptor:
+ *
+ * <generators>
+ * <charngram min = "2" max = "5"/>
+ * <definition/>
+ * <cache>
+ * <window prevLength = "3" nextLength = "3">
+ * <generators>
+ * <prevmap/>
+ * <sentence/>
+ * <tokenclass/>
+ * <tokenpattern/>
+ * </generators>
+ * </window>
+ * </cache>
+ * </generators>
+ *
+ * Each XML element is mapped to a {@link XmlFeatureGeneratorFactory} which
+ * is responsible to process the element and create the specified
+ * {@link AdaptiveFeatureGenerator}. Elements can contain other
+ * elements; in this case it is the responsibility of the mapped factory to process
+ * the child elements correctly. In some factories this leads to recursive
+ * calls to the {@link #createGenerator(Element, FeatureGeneratorResourceProvider)} method.
+ *
+ * In the example above the generators element is mapped to the
+ * {@link AggregatedFeatureGeneratorFactory} which then
+ * creates all the aggregated {@link AdaptiveFeatureGenerator}s. To
+ * accomplish this it evaluates the mapping with the same mechanism
+ * and gives the child elements to the corresponding factories. All
+ * created generators are added to a new instance of the
+ * {@link AggregatedFeatureGenerator} which is then returned.
+ */
+public class GeneratorFactory {
+
+ /**
+  * Contract for turning one XML {@link Element} of the descriptor into the
+  * {@link AdaptiveFeatureGenerator} it describes. The element carries the
+  * configuration of the generator, if it needs any.
+  */
+ interface XmlFeatureGeneratorFactory {
+
+   /**
+    * Builds the {@link AdaptiveFeatureGenerator} described by the given
+    * XML element.
+    *
+    * @param generatorElement the element which contains the configuration
+    * @param resourceManager the resource manager which could be used
+    *     to access referenced resources
+    *
+    * @return the configured {@link AdaptiveFeatureGenerator}
+    *
+    * @throws InvalidFormatException if the element does not describe a
+    *     valid generator
+    */
+   AdaptiveFeatureGenerator create(Element generatorElement,
+       FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException;
+ }
+
+ /**
+ * @see AggregatedFeatureGenerator
+ */
+ static class AggregatedFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+ public AdaptiveFeatureGenerator create(Element generatorElement,
+ FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+ Collection<AdaptiveFeatureGenerator> aggregatedGenerators =
+ new LinkedList<AdaptiveFeatureGenerator>();
+
+ NodeList childNodes = generatorElement.getChildNodes();
+
+ for (int i = 0; i < childNodes.getLength(); i++) {
+ Node childNode = childNodes.item(i);
+
+ if (childNode instanceof Element) {
+ Element aggregatedGeneratorElement = (Element) childNode;
+
+ aggregatedGenerators.add(
+ GeneratorFactory.createGenerator(aggregatedGeneratorElement, resourceManager));
+ }
+ }
+
+ return new AggregatedFeatureGenerator(aggregatedGenerators.toArray(
+ new AdaptiveFeatureGenerator[aggregatedGenerators.size()]));
+ }
+
+ static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+ factoryMap.put("generators", new AggregatedFeatureGeneratorFactory());
+ }
+ }
+
+ /**
+ * @see CachedFeatureGenerator
+ */
+ static class CachedFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+ private CachedFeatureGeneratorFactory() {
+ }
+
+ public AdaptiveFeatureGenerator create(Element generatorElement,
+ FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+ Element cachedGeneratorElement = null;
+
+ NodeList kids = generatorElement.getChildNodes();
+
+ for (int i = 0; i < kids.getLength(); i++) {
+ Node childNode = kids.item(i);
+
+ if (childNode instanceof Element) {
+ cachedGeneratorElement = (Element) childNode;
+ break;
+ }
+ }
+
+ if (cachedGeneratorElement == null) {
+ throw new InvalidFormatException("Could not find containing generator element!");
+ }
+
+ AdaptiveFeatureGenerator chachedGenerator = GeneratorFactory.createGenerator(cachedGeneratorElement, resourceManager);
+
+ return new CachedFeatureGenerator(chachedGenerator);
+ }
+
+ static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+ factoryMap.put("cache", new CachedFeatureGeneratorFactory());
+ }
+ }
+
+ /**
+  * Factory registered for the "charngram" element. It reads the integer
+  * attributes "min" and "max" and passes them to a
+  * {@link CharacterNgramFeatureGenerator}.
+  *
+  * @see CharacterNgramFeatureGenerator
+  */
+ static class CharacterNgramFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+   /**
+    * Creates a {@link CharacterNgramFeatureGenerator} from the "min" and
+    * "max" attributes of the element.
+    *
+    * @param generatorElement the charngram element
+    * @param resourceManager not used by this factory
+    *
+    * @return the configured {@link CharacterNgramFeatureGenerator}
+    *
+    * @throws InvalidFormatException if "min" or "max" is missing or is not
+    *     an integer, the {@link NumberFormatException} is attached as cause
+    */
+   public AdaptiveFeatureGenerator create(Element generatorElement,
+       FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+     int min = parseIntAttribute(generatorElement, "min");
+     int max = parseIntAttribute(generatorElement, "max");
+
+     return new CharacterNgramFeatureGenerator(min, max);
+   }
+
+   /**
+    * Reads the named attribute and parses it as an int.
+    */
+   private static int parseIntAttribute(Element element, String attributeName)
+       throws InvalidFormatException {
+
+     try {
+       return Integer.parseInt(element.getAttribute(attributeName));
+     } catch (NumberFormatException e) {
+       // Keep the original exception as cause so the bad value is not lost.
+       throw new InvalidFormatException(attributeName + " attribute is not a number!", e);
+     }
+   }
+
+   /**
+    * Adds an instance of this factory to the map under the key "charngram".
+    *
+    * @param factoryMap the map to add the factory to
+    */
+   static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+     factoryMap.put("charngram", new CharacterNgramFeatureGeneratorFactory());
+   }
+ }
+
+ /**
+ * @see DefinitionFeatureGenerator
+ */
+ static class DefinitionFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+ private static final String ELEMENT_NAME = "definition";
+
+ private DefinitionFeatureGeneratorFactory() {
+ }
+
+ public AdaptiveFeatureGenerator create(Element generatorElement,
+ FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+ return new OutcomePriorFeatureGenerator();
+ }
+
+ static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+ factoryMap.put(ELEMENT_NAME, new DefinitionFeatureGeneratorFactory());
+ }
+ }
+
+ /**
+  * Factory registered for the "dictionary" element. The "dict" attribute is
+  * the key of a {@link Dictionary} resource, the "prefix" attribute is
+  * handed to the {@link DictionaryFeatureGenerator} together with it.
+  *
+  * @see DictionaryFeatureGenerator
+  */
+ static class DictionaryFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+   /**
+    * Looks up the dictionary referenced by the "dict" attribute and creates
+    * a {@link DictionaryFeatureGenerator} for it.
+    *
+    * @param generatorElement the dictionary element
+    * @param resourceManager used to resolve the "dict" key
+    *
+    * @return the configured {@link DictionaryFeatureGenerator}
+    *
+    * @throws InvalidFormatException if the resource for the key is not a
+    *     {@link Dictionary}
+    */
+   public AdaptiveFeatureGenerator create(Element generatorElement,
+       FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+     String key = generatorElement.getAttribute("dict");
+     Object resource = resourceManager.getResource(key);
+
+     if (resource instanceof Dictionary) {
+       Dictionary dictionary = (Dictionary) resource;
+       return new DictionaryFeatureGenerator(generatorElement.getAttribute("prefix"), dictionary);
+     }
+
+     // instanceof is also false for null, so a missing resource ends up here too.
+     throw new InvalidFormatException("No dictionary resource for key: " + key);
+   }
+
+   /**
+    * Adds an instance of this factory to the map under the key "dictionary".
+    *
+    * @param factoryMap the map to add the factory to
+    */
+   static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+     XmlFeatureGeneratorFactory factory = new DictionaryFeatureGeneratorFactory();
+     factoryMap.put("dictionary", factory);
+   }
+ }
+
+ /**
+ * @see PreviousMapFeatureGenerator
+ */
+ static class PreviousMapFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+ public AdaptiveFeatureGenerator create(Element generatorElement,
+ FeatureGeneratorResourceProvider resourceManager) {
+ return new PreviousMapFeatureGenerator();
+ }
+
+ static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+ factoryMap.put("prevmap", new PreviousMapFeatureGeneratorFactory());
+ }
+ }
+
+ // TODO: Add parameters ...
+
+ /**
+  * Factory registered for the "sentence" element. The optional boolean
+  * attributes "begin" and "end" are passed to the
+  * {@link SentenceFeatureGenerator}, each one defaults to true when it is
+  * empty.
+  *
+  * @see SentenceFeatureGenerator
+  */
+ static class SentenceFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+   /**
+    * Creates a {@link SentenceFeatureGenerator} from the "begin" and "end"
+    * attributes of the element.
+    *
+    * @param generatorElement the sentence element
+    * @param resourceManager not used by this factory
+    *
+    * @return the configured {@link SentenceFeatureGenerator}
+    */
+   public AdaptiveFeatureGenerator create(Element generatorElement,
+       FeatureGeneratorResourceProvider resourceManager) {
+
+     String begin = generatorElement.getAttribute("begin");
+     String end = generatorElement.getAttribute("end");
+
+     // An empty attribute value keeps the default of true, any other value
+     // is interpreted by Boolean.parseBoolean.
+     boolean beginFeature = begin.length() == 0 || Boolean.parseBoolean(begin);
+     boolean endFeature = end.length() == 0 || Boolean.parseBoolean(end);
+
+     return new SentenceFeatureGenerator(beginFeature, endFeature);
+   }
+
+   /**
+    * Adds an instance of this factory to the map under the key "sentence".
+    *
+    * @param factoryMap the map to add the factory to
+    */
+   static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+     XmlFeatureGeneratorFactory factory = new SentenceFeatureGeneratorFactory();
+     factoryMap.put("sentence", factory);
+   }
+ }
+
+ /**
+ * @see TokenClassFeatureGenerator
+ */
+ static class TokenClassFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+ public AdaptiveFeatureGenerator create(Element generatorElement,
+ FeatureGeneratorResourceProvider resourceManager) {
+ // TODO: Make it configurable ...
+ return new TokenClassFeatureGenerator(true);
+ }
+
+ static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+ factoryMap.put("tokenclass", new TokenClassFeatureGeneratorFactory());
+ }
+ }
+
+ /**
+  * Factory registered for the "token" element. It creates a
+  * {@link TokenFeatureGenerator} and reads no configuration.
+  *
+  * @see TokenFeatureGenerator
+  */
+ static class TokenFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+   /**
+    * Creates a new {@link TokenFeatureGenerator}.
+    *
+    * @param generatorElement not inspected by this factory
+    * @param resourceManager not used by this factory
+    *
+    * @return a new {@link TokenFeatureGenerator}
+    */
+   public AdaptiveFeatureGenerator create(Element generatorElement,
+       FeatureGeneratorResourceProvider resourceManager) {
+
+     return new TokenFeatureGenerator();
+   }
+
+   /**
+    * Adds an instance of this factory to the map under the key "token".
+    *
+    * @param factoryMap the map to add the factory to
+    */
+   static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+     // Must register this factory itself; registering the token pattern
+     // factory here would make "token" produce a TokenPatternFeatureGenerator.
+     factoryMap.put("token", new TokenFeatureGeneratorFactory());
+   }
+ }
+
+ static class BigramNameFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+ public AdaptiveFeatureGenerator create(Element generatorElement,
+ FeatureGeneratorResourceProvider resourceManager) {
+
+ return new BigramNameFeatureGenerator();
+ }
+
+ static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+ factoryMap.put("bigram", new BigramNameFeatureGeneratorFactory());
+ }
+ }
+
+ /**
+ * @see TokenPatternFeatureGenerator
+ */
+ static class TokenPatternFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+ public AdaptiveFeatureGenerator create(Element generatorElement,
+ FeatureGeneratorResourceProvider resourceManager) {
+ return new TokenPatternFeatureGenerator();
+ }
+
+ static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+ factoryMap.put("tokenpattern", new TokenPatternFeatureGeneratorFactory());
+ }
+ }
+
+ /**
+  * Factory registered for the "window" element. It wraps the generator
+  * described by the first child element in a {@link WindowFeatureGenerator},
+  * configured by the integer attributes "prevLength" and "nextLength".
+  *
+  * @see WindowFeatureGenerator
+  */
+ static class WindowFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+   /**
+    * Creates the generator of the first child element and wraps it in a
+    * {@link WindowFeatureGenerator}.
+    *
+    * @param generatorElement the window element
+    * @param resourceManager passed on unchanged to the nested factory
+    *
+    * @return the configured {@link WindowFeatureGenerator}
+    *
+    * @throws InvalidFormatException if there is no child element, the child
+    *     element cannot be processed, or "prevLength" or "nextLength" is
+    *     missing or is not an integer
+    */
+   public AdaptiveFeatureGenerator create(Element generatorElement,
+       FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+     Element nestedGeneratorElement = null;
+
+     NodeList kids = generatorElement.getChildNodes();
+
+     // Only the first element node is used, any further elements are ignored.
+     for (int i = 0; i < kids.getLength(); i++) {
+       Node childNode = kids.item(i);
+
+       if (childNode instanceof Element) {
+         nestedGeneratorElement = (Element) childNode;
+         break;
+       }
+     }
+
+     if (nestedGeneratorElement == null) {
+       throw new InvalidFormatException(
+           "window feature generator must contain an aggregator element");
+     }
+
+     // The nested generator is created before the attributes are parsed, so
+     // errors in the nested element are reported first.
+     AdaptiveFeatureGenerator nestedGenerator =
+         GeneratorFactory.createGenerator(nestedGeneratorElement, resourceManager);
+
+     int prevLength = parseIntAttribute(generatorElement, "prevLength");
+     int nextLength = parseIntAttribute(generatorElement, "nextLength");
+
+     return new WindowFeatureGenerator(nestedGenerator, prevLength, nextLength);
+   }
+
+   /**
+    * Reads the named attribute and parses it as an int.
+    */
+   private static int parseIntAttribute(Element element, String attributeName)
+       throws InvalidFormatException {
+
+     try {
+       return Integer.parseInt(element.getAttribute(attributeName));
+     } catch (NumberFormatException e) {
+       // Keep the original exception as cause so the bad value is not lost.
+       throw new InvalidFormatException(attributeName + " attribute is not a number!", e);
+     }
+   }
+
+   /**
+    * Adds an instance of this factory to the map under the key "window".
+    *
+    * @param factoryMap the map to add the factory to
+    */
+   static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+     factoryMap.put("window", new WindowFeatureGeneratorFactory());
+   }
+ }
+
+ private static Map<String, XmlFeatureGeneratorFactory> factories =
+ new HashMap<String, XmlFeatureGeneratorFactory>();
+
+ static {
+ AggregatedFeatureGeneratorFactory.register(factories);
+ CachedFeatureGeneratorFactory.register(factories);
+ CharacterNgramFeatureGeneratorFactory.register(factories);
+ DefinitionFeatureGeneratorFactory.register(factories);
+ DictionaryFeatureGeneratorFactory.register(factories);
+ PreviousMapFeatureGeneratorFactory.register(factories);
+ SentenceFeatureGeneratorFactory.register(factories);
+ TokenClassFeatureGeneratorFactory.register(factories);
+ TokenFeatureGeneratorFactory.register(factories);
+ BigramNameFeatureGeneratorFactory.register(factories);
+ TokenPatternFeatureGeneratorFactory.register(factories);
+ WindowFeatureGeneratorFactory.register(factories);
+ }
+
+ /**
+  * Creates the {@link AdaptiveFeatureGenerator} described by the provided
+  * element. The factory is looked up by the tag name of the element and is
+  * then responsible for creating the generator from the element.
+  *
+  * @param generatorElement the element which describes the generator
+  * @param resourceManager passed on unchanged to the factory
+  *
+  * @return the generator created by the factory mapped to the tag name
+  *
+  * @throws InvalidFormatException if no factory is registered for the tag
+  *     name or the factory rejects the element
+  */
+ static AdaptiveFeatureGenerator createGenerator(Element generatorElement,
+     FeatureGeneratorResourceProvider resourceManager) throws InvalidFormatException {
+
+   String tagName = generatorElement.getTagName();
+   XmlFeatureGeneratorFactory factory = factories.get(tagName);
+
+   if (factory != null) {
+     return factory.create(generatorElement, resourceManager);
+   }
+
+   throw new InvalidFormatException("Unexpected element: " + tagName);
+ }
+
+ /**
+  * Creates an {@link AdaptiveFeatureGenerator} from a provided XML descriptor.
+  *
+  * Usually this XML descriptor contains a set of nested feature generators
+  * which are then used to generate the features by one of the opennlp
+  * components.
+  *
+  * @param xmlDescriptorIn the {@link InputStream} from which the descriptor
+  * is read, the stream remains open and must be closed by the caller.
+  *
+  * @param resourceManager the resource manager which is used to resolve resources
+  * referenced by a key in the descriptor
+  *
+  * @return the generator described by the root element of the descriptor
+  *
+  * @throws IOException if an error occurs during reading from the descriptor
+  * {@link InputStream}
+  * @throws InvalidFormatException if the descriptor is not valid XML or
+  * contains an element which cannot be turned into a generator
+  * @throws IllegalStateException if no XML parser can be created
+  */
+ public static AdaptiveFeatureGenerator create(InputStream xmlDescriptorIn,
+     FeatureGeneratorResourceProvider resourceManager) throws IOException, InvalidFormatException {
+
+   // NOTE(review): the parser runs with its default configuration. If
+   // descriptors can come from untrusted sources, consider disabling DTDs
+   // and external entities on the factory.
+   DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance();
+
+   DocumentBuilder documentBuilder;
+
+   try {
+     documentBuilder = documentBuilderFactory.newDocumentBuilder();
+   } catch (ParserConfigurationException e) {
+     // Without a parser nothing can be read. Fail right here with the cause
+     // attached instead of continuing with a null builder.
+     throw new IllegalStateException(e);
+   }
+
+   org.w3c.dom.Document xmlDescriptorDOM;
+
+   try {
+     xmlDescriptorDOM = documentBuilder.parse(xmlDescriptorIn);
+   } catch (SAXException e) {
+     throw new InvalidFormatException("Descriptor is not valid XML!", e);
+   }
+
+   Element generatorElement = xmlDescriptorDOM.getDocumentElement();
+
+   return createGenerator(generatorElement, resourceManager);
+ }
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain