You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/06 15:58:21 UTC

svn commit: r1406164 - in /stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/stanbol/ src/main/java/org/apache/stanbol/enhancer...

Author: rwesten
Date: Tue Nov  6 14:58:20 2012
New Revision: 1406164

URL: http://svn.apache.org/viewvc?rev=1406164&view=rev
Log:
STANBOL-795: implementation of the OpenNLP Tokenizer Engine

Added:
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/   (with props)
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/pom.xml
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/token/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/token/impl/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/token/impl/OpenNlpTokenizerEngine.java
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/resources/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/resources/OSGI-INF/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/resources/OSGI-INF/metatype/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/resources/OSGI-INF/metatype/metatype.properties

Propchange: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue Nov  6 14:58:20 2012
@@ -0,0 +1,7 @@
+.settings
+
+target
+
+.classpath
+
+.project

Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/pom.xml?rev=1406164&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/pom.xml (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/pom.xml Tue Nov  6 14:58:20 2012
@@ -0,0 +1,102 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  You under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+    <groupId>org.apache.stanbol</groupId>
+    <version>0.10.0-SNAPSHOT</version>
+    <relativePath>../../parent</relativePath>
+  </parent>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.enhancer.engines.opennlp.token</artifactId>
+  <version>0.10.0-SNAPSHOT</version>
+  <packaging>bundle</packaging>
+
+  <name>Apache Stanbol Enhancer Enhancement Engine: OpenNLP tokenizer </name>
+  <description>A Stanbol engine tokenizing the AnalyzedText contentpart for further
+      processing by other engines </description>
+
+  <inceptionYear>2012</inceptionYear>
+
+  <scm>
+    <connection>
+      scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/opennlp-token/
+    </connection>
+    <developerConnection>
+      scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/opennlp-token/
+    </developerConnection>
+    <url>http://stanbol.apache.org/</url>
+  </scm>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Import-Package>
+              org.apache.stanbol.enhancer.servicesapi; provide:=true,
+              org.apache.stanbol.enhancer.servicesapi.impl; provide:=true,
+              *
+            </Import-Package>
+            <Private-Package>
+              org.apache.stanbol.enhancer.engines.opennlp.token.impl
+            </Private-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <excludes>
+            <!-- AL20 License -->
+            <exclude>src/license/THIRD-PARTY.properties</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-scr-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+      <version>0.10.0-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.commons.opennlp</artifactId>
+      <version>0.10.0-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+      <version>0.10.0-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.felix</groupId>
+      <artifactId>org.apache.felix.scr.annotations</artifactId>
+    </dependency>
+  </dependencies>
+
+</project>

Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/token/impl/OpenNlpTokenizerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/token/impl/OpenNlpTokenizerEngine.java?rev=1406164&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/token/impl/OpenNlpTokenizerEngine.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/token/impl/OpenNlpTokenizerEngine.java Tue Nov  6 14:58:20 2012
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.opennlp.token.impl;
+
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
+
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
+import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.osgi.framework.Constants;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Enhancement engine that tokenizes the plain text of a content item by using OpenNLP.
+ * Requires a text/plain part and a language enabled by the configuration; the tokens are
+ * added to the AnalysedText so that they can be used for further processing by other engines.
+ * 
+ * @author Sebastian Schaffert
+ */
+
+@Component(immediate = true, metatype = true, 
+    policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
+@Service
+@Properties(value={
+        @Property(name= EnhancementEngine.PROPERTY_NAME,value="opennlp-token"),
+        @Property(name=OpenNlpTokenizerEngine.CONFIG_LANGUAGES, value = {"*"},cardinality=Integer.MAX_VALUE),
+        @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
+})
+public class OpenNlpTokenizerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {
+
+    /**
+     * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
+     * are the languages given as default value.
+     */
+    public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.token.languages";
+
+    /**
+     * The parameter name used to configure the name of the OpenNLP model used for tokenizing
+     */
+    private static final String MODEL_NAME_PARAM = "model";
+
+    /**
+     * Configuring {@value #SIMPLE_MODEL_NAME} as value for the {@link #MODEL_NAME_PARAM}
+     * will cause the {@link SimpleTokenizer#INSTANCE} to be used for
+     * Tokenizing the language.<p>
+     * This might be useful to force the usage of this tokenizer even if a
+     * language specific model is available via the {@link OpenNLP} service.
+     */
+    private static final String SIMPLE_MODEL_NAME = "SIMPLE";
+
+    private static final Map<String,Object> SERVICE_PROPERTIES;
+    static {
+        Map<String,Object> props = new HashMap<String,Object>();
+        props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING, 
+            ServiceProperties.ORDERING_NLP_TOKENIZING);
+        props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE, 
+            NlpProcessingRole.Tokenizing);
+        SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
+    }
+
+
+    private static Logger log = LoggerFactory.getLogger(OpenNlpTokenizerEngine.class);
+
+    //Language configuration
+    private LanguageConfiguration languageConfig = new LanguageConfiguration(CONFIG_LANGUAGES,new String[]{"*"});
+
+    @Reference
+    private OpenNLP openNLP;
+    
+    @Reference
+    private AnalysedTextFactory analysedTextFactory;
+    
+    /**
+     * Checks if this engine is able to tokenize the plain text of the supplied ContentItem.
+     * <p/>
+     * Returns ENHANCE_ASYNC in case there is a text/plain content part, the language of the
+     * content item is enabled by the configuration and a tokenizer is available for it;
+     * CANNOT_ENHANCE otherwise.
+     *
+     * @param ci the content item to check
+     * @return ENHANCE_ASYNC or CANNOT_ENHANCE
+     * @throws EngineException if a custom configured tokenizer model
+     *          can not be loaded
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        //(1) a plain text content part needs to be present
+        Map.Entry<UriRef,Blob> plainText = NlpEngineHelper.getPlainText(this, ci, false);
+        if(plainText == null || plainText.getValue() == null) {
+            return CANNOT_ENHANCE;
+        }
+        //(2) the language must be known, enabled and supported by a tokenizer
+        final String lang = getLanguage(this,ci,false);
+        final int state;
+        if(lang == null) {
+            state = CANNOT_ENHANCE;
+        } else if(!languageConfig.isLanguage(lang)){
+            log.trace(" > can NOT enhance ContentItem {} because language {} is "
+                + "not enabled by this engines configuration",ci,lang);
+            state = CANNOT_ENHANCE;
+        } else if(getTokenizer(lang) == null){
+            log.trace(" > can NOT tokenize plain text of {} because the tokenizer "
+                + "for language {} is not available.",ci,lang);
+            state = CANNOT_ENHANCE;
+        } else {
+            log.trace(" > can enhance ContentItem {} with language {}",ci,lang);
+            state = ENHANCE_ASYNC;
+        }
+        return state;
+    }
+
+    /**
+     * Tokenizes the text of the supplied ContentItem and adds the tokens to the
+     * {@link AnalysedText} obtained for it.
+     * <p/>
+     * If sentences are already annotated each sentence is tokenized on its own; otherwise
+     * the whole text is tokenized as a single section. For every span returned by the
+     * {@link Tokenizer} a {@link Token} is added to the processed section.
+     * <p/>
+     * If no tokenizer is available for the language a warning (including the language)
+     * is logged and no tokens are added.
+     *
+     * @param ci the content item to tokenize
+     * @throws EngineException if a custom configured tokenizer model can not be loaded
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        AnalysedText at = initAnalysedText(this,analysedTextFactory,ci);
+        String language = getLanguage(this, ci, true);
+        
+        Tokenizer tokenizer = getTokenizer(language);
+        if(tokenizer == null){
+            log.warn("Tokenizer for language {} is no longer available. "
+                    + "This might happen if the model becomes unavailable during enhancement. "
+                    + "If this happens more often it might also indicate an bug in the used "
+                    + "EnhancementJobManager implementation as the availability is also checked "
+                    + "in the canEnhance(..) method of this Enhancement Engine.", language);
+            return;
+        }
+        //Try to use sentences for tokenizing
+        Iterator<? extends Section> sections = at.getSentences();
+        if(!sections.hasNext()){
+            //if no sentences are annotated
+            sections = Collections.singleton(at).iterator();
+        }
+        
+        //for all sentences (or the whole Text - if no sentences available)
+        while(sections.hasNext()){
+            Section section = sections.next();
+            //Tokenize section
+            opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan());
+            for(int i=0;i<tokenSpans.length;i++){
+                Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
+                log.trace(" > add {}",token);
+            }
+        }
+    }
+
+    @Override
+    public Map<String,Object> getServiceProperties() {
+        return SERVICE_PROPERTIES; //unmodifiable map holding the engine ordering and the NLP role
+    }
+    /**
+     * Activates the engine and applies the language configuration ({@link #CONFIG_LANGUAGES})
+     * read from the properties of the component context.
+     * @param ce the {@link org.osgi.service.component.ComponentContext}
+     * @throws ConfigurationException propagated from the super class or the language configuration
+     */
+    @Activate
+    protected void activate(ComponentContext ce) throws ConfigurationException {
+        log.info("activating OpenNLP tokenizer engine");
+        super.activate(ce);
+        @SuppressWarnings("unchecked")
+        Dictionary<String, Object> properties = ce.getProperties();
+        languageConfig.setConfiguration(properties);
+    }
+    
+    @Deactivate
+    protected void deactivate(ComponentContext context) {
+        languageConfig.setDefault(); //presumably resets the configuration applied in activate(..)
+        super.deactivate(context);
+    }
+    /**
+     * Getter for the Tokenizer of a language. The {@link #MODEL_NAME_PARAM} of the
+     * {@link #languageConfig} selects a custom model or - with {@value #SIMPLE_MODEL_NAME} -
+     * the {@link SimpleTokenizer}; without this parameter the {@link OpenNLP} service is asked.
+     * @param language the language
+     * @return the {@link Tokenizer}; callers check for <code>null</code> (no tokenizer available)
+     * @throws EngineException in case a custom configured model is not
+     * available or an error occurred during loading.
+     */
+    private Tokenizer getTokenizer(String language) throws EngineException {
+        String modelName = languageConfig.getParameter(language, MODEL_NAME_PARAM);
+        if(modelName == null){
+            return openNLP.getTokenizer(language);
+        } else if(SIMPLE_MODEL_NAME.equals(modelName)){
+            return SimpleTokenizer.INSTANCE;
+        } else { //try to load the configured model
+            TokenizerModel model;
+            try {
+                model = openNLP.getModel(TokenizerModel.class, modelName, null);
+            } catch (Exception e) {
+                throw new EngineException("Error while loading the configured OpenNLP "
+                    + "TokenizerModel '"+modelName+"' ("+getClass().getSimpleName()+" | name="
+                    + getName() + ")!",e);
+            }
+            if(model == null){
+                throw new EngineException("The configured OpenNLP TokenizerModel '"
+                        + modelName +"' is not available ("+getClass().getSimpleName()
+                        + " | name=" + getName() + ")!");
+            }
+            return new TokenizerME(model);
+        }
+    }
+    
+}

Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1406164&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-token/src/main/resources/OSGI-INF/metatype/metatype.properties Tue Nov  6 14:58:20 2012
@@ -0,0 +1,41 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+#
+
+org.apache.stanbol.enhancer.engines.opennlp.token.impl.OpenNlpTokenizerEngine.name=Apache \
+Stanbol Enhancer Engine: OpenNLP Tokenizer
+org.apache.stanbol.enhancer.engines.opennlp.token.impl.OpenNlpTokenizerEngine.description=Enhancement \
+Engine that tokenizes text by using the OpenNLP Tokenizer. It can be configured to use a \
+custom TokenizerModel or the SimpleTokenizer by configuring 'SIMPLE' as model name.
+
+
+stanbol.enhancer.engine.name.name=Name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
+
+org.apache.stanbol.enhancer.token.languages.name=Language configuration
+org.apache.stanbol.enhancer.token.languages.description=Takes a list of ISO \
+  language codes. '*' is the Wildcard; '!{lang}' to exclude a language; \
+  '{lang};model={tokenizer-model-file-name}' to configure a \
+  custom OpenNLP tokenizer model for a language. The name 'SIMPLE' can be used to \
+  force the usage of the SimpleTokenizer for a language. \
+  Models are loaded via the Stanbol DataFileProvider service. So users can e.g. \
+  put models in the datafiles directory \
+  (defaults to '{stanbol-working-dir}/stanbol/datafiles')