You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/09 16:37:17 UTC
svn commit: r1556839 - in /stanbol/trunk: enhancement-engines/ enhancement-engines/pos-chunker/ enhancement-engines/pos-chunker/src/ enhancement-engines/pos-chunker/src/main/ enhancement-engines/pos-chunker/src/main/java/ enhancement-engines/pos-chunke...

Author: rwesten
Date: Thu Jan  9 15:37:16 2014
New Revision: 1556839

URL: http://svn.apache.org/r1556839
Log:
STANBOL-1251: first version of the Pos-Chunker engine

Added:
    stanbol/trunk/enhancement-engines/pos-chunker/   (with props)
    stanbol/trunk/enhancement-engines/pos-chunker/pom.xml
    stanbol/trunk/enhancement-engines/pos-chunker/src/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/
    stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties
Modified:
    stanbol/trunk/enhancement-engines/pom.xml
    stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml

Modified: stanbol/trunk/enhancement-engines/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pom.xml?rev=1556839&r1=1556838&r2=1556839&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/pom.xml (original)
+++ stanbol/trunk/enhancement-engines/pom.xml Thu Jan  9 15:37:16 2014
@@ -60,6 +60,7 @@
 	  <module>langdetect</module>
     <module>langid</module>
     <module>opennlp</module>
+    <module>pos-chunker</module>
 
     <!-- Chinese language support -->
     <module>smartcn-token</module> <!-- sentence detection and tokenizing -->

Propchange: stanbol/trunk/enhancement-engines/pos-chunker/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jan  9 15:37:16 2014
@@ -0,0 +1,7 @@
+.settings
+
+.project
+
+.classpath
+
+target

Added: stanbol/trunk/enhancement-engines/pos-chunker/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/pom.xml?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/pom.xml (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/pom.xml Thu Jan  9 15:37:16 2014
@@ -0,0 +1,104 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  You under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.stanbol</groupId>
+    <artifactId>apache-stanbol-enhancement-engines</artifactId>
+    <version>1.0.0-SNAPSHOT</version>
+    <relativePath>..</relativePath>
+  </parent>
+
+  <artifactId>org.apache.stanbol.enhancer.engines.poschunker</artifactId>
+  <packaging>bundle</packaging>
+
+  <name>Apache Stanbol Enhancement Engine : POS tag based Chunker</name>
+  <description>
+    Uses POS tag information of Tokens to create Noun and Verb phrases.
+  </description>
+
+  <inceptionYear>2014</inceptionYear>
+
+  <scm>
+    <connection>
+      scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/pos-chunker
+    </connection>
+    <developerConnection>
+      scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/pos-chunker
+    </developerConnection>
+    <url>http://stanbol.apache.org/</url>
+  </scm>
+  <!-- NOTE(review): opennlp.model.path is not referenced in this POM; confirm it is needed -->
+  <properties>
+    <opennlp.model.path>org/apache/stanbol/data/opennlp</opennlp.model.path>
+  </properties>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Import-Package>
+              org.apache.stanbol.enhancer.servicesapi; provide:=true; version="[0.11,1.1)",
+              org.apache.stanbol.enhancer.servicesapi.impl; provide:=true; version="[0.11,1.1)",
+              *
+            </Import-Package>
+            <!-- Export the PhraseBuilder and PhraseTypeDefinition -->
+            <Export-Package>
+              org.apache.stanbol.enhancer.engines.poschunker;version=${project.version}
+            </Export-Package>
+            <!-- Keep Engine private as it is used as a service -->
+            <Private-Package>
+              org.apache.stanbol.enhancer.engines.poschunker.engine;version=${project.version}
+            </Private-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <excludes>
+            <!-- AL20 License -->
+            <exclude>src/license/THIRD-PARTY.properties</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-scr-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+      <version>1.0.0-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+      <version>1.0.0-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.felix</groupId>
+      <artifactId>org.apache.felix.scr.annotations</artifactId>
+    </dependency>
+  </dependencies>
+
+</project>

Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java Thu Jan  9 15:37:16 2014
@@ -0,0 +1,203 @@
+package org.apache.stanbol.enhancer.engines.poschunker;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+public class PhraseBuilder {
+    
+    /**
+     * Fallback score used for POS annotations that do not provide a probability. 
+     * In most cases the value of this will not have any effect as typically 
+     * POS taggers that do not provide probabilities only emit a
+     * single POS tag per Token. In such cases this tag will always be accepted 
+     * regardless of the configured value. <p>
+     * The value is only important if some POS annotations of a Token do have 
+     * probabilities while others have not. In such cases those without are rated 
+     * against the others by using this value. Such situations should only
+     * occur if a chain uses several POS taggers - a setting that should be
+     * avoided.
+     */
+    private static final double DEFAULT_SCORE = 0.1;
+    /** The definition (start, continuation, required and end types) of the built phrases */
+    private final PhraseTypeDefinition phraseType;
+    /** Factory used to create the {@link Chunk} for a detected phrase */
+    private final ChunkFactory chunkFactory;
+    /** Minimum normalized POS score [0..1] needed for a Token to match a category set */
+    private final double minPosSocre;
+    /**
+     * The {@link PhraseTag} added to all {@link Chunk}s created by this
+     * {@link PhraseBuilder}
+     */
+    private final PhraseTag phraseTag;
+        
+    /**
+     * Holds the Tokens of the current phrase. Empty if no phrase is building.
+     */
+    private List<Token> current = new ArrayList<Token>();
+    /**
+     * If {@link #current} contains a Token matching 
+     * {@link PhraseTypeDefinition#getRequiredType()}
+     */
+    boolean valid;
+    /** Creates a builder; rejects <code>null</code> arguments and scores outside [0..1] */
+    public PhraseBuilder(PhraseTypeDefinition phraseType, ChunkFactory chunkFactory, double minPosSocre) {
+        if(phraseType == null){
+            throw new IllegalArgumentException("The parsed PhraseTypeDefinition MUST NOT be NULL!");
+        }
+        this.phraseType = phraseType;
+        this.phraseTag = new PhraseTag(phraseType.getPhraseType().name(), 
+            phraseType.getPhraseType());
+        if(chunkFactory == null){
+            throw new IllegalArgumentException("The parsed ChunkFactory MUST NOT be NULL");
+        }
+        this.chunkFactory = chunkFactory;
+        if(minPosSocre < 0 || minPosSocre > 1){
+            throw new IllegalArgumentException("The parsed minPosScore '" + minPosSocre 
+                + "' MUST BE within the ranve [0..1]!");
+        }
+        this.minPosSocre = minPosSocre;
+    }
+    /** Processes the next Token: starts a phrase, continues the current one or,
+     *  if the Token can not continue it, builds the phrase collected so far. */
+    public void nextToken(Token token){
+        if(current.isEmpty()){ //check for start
+            checkStart(token);
+        } else if(!checkContinuation(token)){ //check for continuation
+            buildPhrase(token);
+        }
+        
+    }
+    /** Builds the phrase collected so far. The parsed Section is currently not used. */
+    public void nextSection(Section section){
+        buildPhrase(null);
+    }
+    /** Adds the Token as first Token of a new phrase if it matches the start types;
+     *  {@link #valid} is initialised with the result of the required type check. */
+    @SuppressWarnings("unchecked") //varargs with generic types
+    private void checkStart(Token token){
+        boolean[] states = checkCategories(token, phraseType.getStartType(), 
+            phraseType.getRequiredType());
+        if(states[0]){
+            current.add(token);
+            valid = states[1];
+        }
+    }
+    /** Appends the Token to the current phrase if it matches the continuation types */
+    @SuppressWarnings("unchecked") //varargs with generic types
+    private boolean checkContinuation(Token token){
+        final boolean[] states;
+        if(!valid){
+            states = checkCategories(token, phraseType.getContinuationType(),
+                phraseType.getRequiredType());
+        } else {
+            states = checkCategories(token, phraseType.getContinuationType());
+        }
+        if(states[0]){
+            current.add(token);
+        }
+        if(states[0] && states.length > 1){ //only Tokens added to the phrase can make it valid
+            valid = states[1];
+        }
+        return states[0];
+    }
+    /** Creates the Chunk for the current phrase (if valid), resets the state and re-checks token as start */
+    @SuppressWarnings("unchecked") //varargs with generic types
+    private void buildPhrase(Token token) {
+        Token lastConsumedToken = null;
+        if(valid){
+            //search backwards for the first token matching an allowed end
+            //category
+            int endIndex = current.size()-1;
+            while(endIndex > 0 && !checkCategories(current.get(endIndex), 
+                phraseType.getEndType())[0]){
+                endIndex--;
+            }
+            lastConsumedToken = current.get(endIndex);
+            //NOTE: ignore phrases with a single token
+            if(endIndex > 0){
+                Chunk chunk = chunkFactory.createChunk(current.get(0), lastConsumedToken);
+                //TODO: add support for confidence
+                chunk.addAnnotation(PHRASE_ANNOTATION, Value.value(phraseTag));
+            }
+        }
+        //cleanup
+        current.clear();
+        valid = false;
+        if(token != null && !token.equals(lastConsumedToken)){
+            //the current token might be the start of a new phrase
+            checkStart(token);
+        }
+    }
+    
+    /**
+     * Checks if the {@link NlpAnnotations#POS_ANNOTATION POS Annotations}
+     * of a {@link Token} match the parsed categories. This method supports
+     * checking against multiple sets of categories to allow checking e.g. if a token
+     * is suitable for {@link PhraseTypeDefinition#getStartType()} and
+     * {@link PhraseTypeDefinition#getRequiredType()}.
+     * @param token the Token
+     * @param categories the sets of categories to check
+     * @return for each parsed category set <code>true</code> if the normalized sum
+     * of the scores of matching POS annotations is higher or equals the configured
+     * {@link #minPosSocre}; always <code>false</code> for Tokens without POS annotations.
+     */
+    private boolean[] checkCategories(Token token, Set<LexicalCategory>...categories) {
+        //there are different ways NLP frameworks do assign scores. For some the
+        //sum of all categories would sum up to 1.0, but as only the top three
+        //categories are included the sum would be < 1
+        //Others assign scores so that each score is < 1, but the sum of all
+        //is higher than 1.0.
+        //There is also the possibility that no scores are present.
+        
+        //Because of that this sums up all scores and normalizes with
+        //Math.max(1.0,sumScore).
+        //POS tags without score are assigned the #DEFAULT_SCORE. If not a single
+        //POS tag with a score is present the sumScore is NOT normalized to 1.0
+        boolean scorePresent = false;
+        double sumScore = 0;
+        double[] matchScores = new double[categories.length];
+        for(Value<PosTag> pos : token.getAnnotations(POS_ANNOTATION)){
+            double score = pos.probability();
+            if(score == Value.UNKNOWN_PROBABILITY){
+                score = DEFAULT_SCORE;
+            } else {
+                scorePresent = true;
+            }
+            sumScore = sumScore + score; //use the score with the fallback applied
+            Set<LexicalCategory> tokenCategories = pos.value().getCategories();
+            for(int i = 0; i < categories.length; i++){
+                Set<LexicalCategory> category = categories[i];
+                if(!Collections.disjoint(tokenCategories, category)){
+                    matchScores[i] = matchScores[i] + score;
+                }
+            }
+        }
+        boolean[] matches = new boolean[matchScores.length];
+        //the score used to normalize annotations (0 if there are no POS annotations)
+        double normScore = scorePresent ? Math.max(1.0,sumScore) : sumScore;
+        for(int i = 0; i < matchScores.length ; i++){
+            matches[i] = matchScores[i]/normScore >= minPosSocre; //0/0 = NaN -> false
+        }
+        return matches;
+    }
+    /** Callback used by the {@link PhraseBuilder} to create the Chunks of detected phrases */
+    public static interface ChunkFactory {
+        /** Creates the Chunk spanning from the parsed start to the parsed end Token */
+        Chunk createChunk(Token start, Token end);
+    }
+    
+}

Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java Thu Jan  9 15:37:16 2014
@@ -0,0 +1,138 @@
+package org.apache.stanbol.enhancer.engines.poschunker;
+
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+
+public class PhraseTypeDefinition {
+
+    protected final LexicalCategory phraseType;
+    
+    private final Set<LexicalCategory> startTypes;
+    protected final Set<LexicalCategory> readOnlyStartTypes;
+    private final Set<LexicalCategory> continuationTypes;
+    protected final Set<LexicalCategory> readOnlyContinuationTypes;
+    private final Set<LexicalCategory> requiredTypes;
+    protected final Set<LexicalCategory> readOnlyRequiredTypes;
+    private final Set<LexicalCategory> endTypes;
+    protected final Set<LexicalCategory> readOnlyEndTypes;
+    
+    public PhraseTypeDefinition(LexicalCategory phraseType) {
+        if(phraseType == null){
+            throw new IllegalArgumentException("The parsed PhraseType MUST NOT be NULL!");
+        }
+        this.phraseType = phraseType;
+        startTypes = EnumSet.of(phraseType);
+        readOnlyStartTypes = Collections.unmodifiableSet(startTypes);
+        continuationTypes = EnumSet.of(phraseType);
+        readOnlyContinuationTypes = Collections.unmodifiableSet(continuationTypes);
+        requiredTypes = EnumSet.of(phraseType);
+        readOnlyRequiredTypes = Collections.unmodifiableSet(requiredTypes);
+        endTypes = EnumSet.of(phraseType);
+        readOnlyEndTypes = Collections.unmodifiableSet(startTypes);
+    }
+    
+    public boolean addStartType(LexicalCategory...types){
+        return add(startTypes,types);
+    }
+    
+    public boolean addContinuationType(LexicalCategory...types){
+        return add(continuationTypes,types);
+    }
+    
+    public boolean addRequiredType(LexicalCategory...types){
+        return add(requiredTypes,types);
+    }
+    public boolean addEndType(LexicalCategory...types){
+        return add(endTypes,types);
+    }
+    
+    public boolean removeStartType(LexicalCategory...types){
+        return remove(startTypes,types);
+    }
+    
+    public boolean removeContinuationType(LexicalCategory...types){
+        return remove(continuationTypes,types);
+    }
+    
+    public boolean removeRequiredType(LexicalCategory...types){
+        return remove(requiredTypes,types);
+    }
+
+    public boolean removeEndType(LexicalCategory...types){
+        return remove(endTypes,types);
+    }
+    /**
+     * Getter for the type of this phrase definition
+     * @return
+     */
+    public LexicalCategory getPhraseType(){
+        return phraseType;
+    }
+    
+    /**
+     * Getter for the read only set with the start types
+     * @return the read only set with {@link LexicalCategory LexicalCategories}
+     * that can start a phrase of that type
+     */
+    public Set<LexicalCategory> getStartType(){
+        return readOnlyStartTypes;
+    }
+    
+    /**
+     * Getter for the read only set with the continuation types
+     * @return the read only set with {@link LexicalCategory LexicalCategories}
+     * that can continue a phrase of that type
+     */
+    public Set<LexicalCategory> getContinuationType(){
+        return readOnlyContinuationTypes;
+    }
+    
+    /**
+     * Getter for the read only set with the required types
+     * @return the read only set with {@link LexicalCategory LexicalCategories}
+     * that MUST occur within a phrase of that type
+     */
+    public Set<LexicalCategory> getRequiredType(){
+        return readOnlyRequiredTypes;
+    }
+    
+    /**
+     * Getter for the read only set with the end types.
+     * @return the read only set with {@link LexicalCategory LexicalCategories}
+     * that can end a phrase of that type
+     */
+    public Set<LexicalCategory> getEndType(){
+        return readOnlyEndTypes;
+    }
+
+    private boolean add(Set<LexicalCategory> set, LexicalCategory...types){
+        boolean changed = false;
+        if(types != null){
+            for(LexicalCategory type : types){
+                if(type != null){
+                    if(set.add(type)){
+                        changed = true;
+                    }
+                }
+            }
+        }
+        return changed;
+    }
+    
+    private boolean remove(Set<LexicalCategory> set, LexicalCategory...types){
+        boolean changed = false;
+        if(types != null){
+            for(LexicalCategory type : types){
+                if(type != null){
+                    if(set.remove(type)){
+                        changed = true;
+                    }
+                }
+            }
+        }
+        return changed;
+    }
+}

Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java Thu Jan  9 15:37:16 2014
@@ -0,0 +1,41 @@
+package org.apache.stanbol.enhancer.engines.poschunker.engine;
+
+import java.util.concurrent.locks.ReadWriteLock;
+
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder.ChunkFactory;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+
+/**
+ * {@link ChunkFactory} that adds the Chunks requested by the {@link PhraseBuilder}
+ * to an {@link AnalysedText} while holding the write lock parsed to the constructor.
+ * @author Rupert Westenthaler
+ *
+ */
+public class ChunkFactoryImpl implements ChunkFactory{
+    /** the text the Chunks are added to */
+    private final AnalysedText analysedText;
+    private final ReadWriteLock rwLock;
+    
+    public ChunkFactoryImpl(AnalysedText at, ReadWriteLock lock) {
+        analysedText = at;
+        rwLock = lock;
+    }
+    /** Adds a Chunk from the start offset of the start Token to the end offset of the end Token */
+    @Override
+    public Chunk createChunk(Token start, Token end) {
+        if(start == null || end == null){
+            throw new IllegalArgumentException("Parst start Token '" + start
+                + "' and end Token '" + end +"' MUST NOT be NULL!");
+        }
+        rwLock.writeLock().lock(); //the AnalysedText is only modified under the write lock
+        try {
+            return analysedText.addChunk(start.getStart(), end.getEnd());
+        } finally {
+            rwLock.writeLock().unlock();  
+        }
+    }
+
+}

Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java Thu Jan  9 15:37:16 2014
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.poschunker.engine;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.isLangaugeConfigured;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder.ChunkFactory;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseTypeDefinition;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
+import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.osgi.framework.Constants;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A phrase detector (chunker) that works on the POS annotations of the
+ * {@link Token}s of an {@link AnalysedText}. For every enabled phrase type
+ * (noun phrases: {@link #NOUN_PHRASE_STATE}, verb phrases:
+ * {@link #VERB_PHRASE_STATE}) a {@link PhraseBuilder} configured with the
+ * according {@link PhraseTypeDefinition} is used to build the phrases.
+ * The minimum score of POS annotations is configured by using
+ * {@link #MIN_POS_SCORE}.
+ * <p>
+ * The engine requires an {@link AnalysedText} to be present for the processed
+ * content item and a language that is covered by the language configuration
+ * ({@link #CONFIG_LANGUAGES}).
+ * 
+ * @author Sebastian Schaffert
+ */
+@Component(immediate = true, metatype = true, 
+    configurationFactory = true, //allow multiple instances to be configured
+    policy = ConfigurationPolicy.OPTIONAL) //create the default instance with the default config
+@Service
+@Properties(value={
+        @Property(name=EnhancementEngine.PROPERTY_NAME,value="pos-chunker"),
+        @Property(name=PosChunkerEngine.CONFIG_LANGUAGES, value = {"*"}),
+        @Property(name=PosChunkerEngine.MIN_POS_SCORE, 
+            doubleValue=PosChunkerEngine.DEFAULT_MIN_POS_SCORE),
+        @Property(name=PosChunkerEngine.NOUN_PHRASE_STATE, 
+            boolValue=PosChunkerEngine.DEFAULT_NOUN_PHRASE_STATE),
+        @Property(name=PosChunkerEngine.VERB_PHRASE_STATE, 
+            boolValue=PosChunkerEngine.DEFAULT_VERB_PHRASE_STATE),
+        @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
+})
+public class PosChunkerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {
+
+    /**
+     * Unmodifiable service properties of this engine as returned by
+     * {@link #getServiceProperties()}: the NLP processing role
+     * ({@link NlpProcessingRole#Chunking}) and the engine ordering
+     * ({@link ServiceProperties#ORDERING_NLP_CHUNK}).
+     */
+    private static final Map<String,Object> SERVICE_PROPERTIES;
+    static {
+        Map<String,Object> serviceProps = new HashMap<String,Object>();
+        serviceProps.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE, 
+            NlpProcessingRole.Chunking);
+        serviceProps.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING, 
+            ServiceProperties.ORDERING_NLP_CHUNK);
+        SERVICE_PROPERTIES = Collections.unmodifiableMap(serviceProps);
+    }
+    /**
+     * Key of the language configuration property. Takes a list of ISO language codes of supported languages.
+     * The default value declared for this component is the wildcard <code>"*"</code>.
+     */
+    public static final String CONFIG_LANGUAGES = "enhancer.engine.poschunker.languages";
+
+    /**
+     * Key of the property used to configure the minimum POS score. The
+     * configured value is parsed to the {@link PhraseBuilder}s.
+     */
+    public static final String MIN_POS_SCORE = "enhancer.engine.poschunker.minPosScore";
+    /**
+     * The minimum POS score used if {@link #MIN_POS_SCORE} is not configured.
+     */
+    public static final double DEFAULT_MIN_POS_SCORE = 0.5;
+    
+    /**
+     * Key of the boolean property that enables/disables noun phrases.
+     */
+    public static final String NOUN_PHRASE_STATE = "enhancer.engine.poschunker.nounPhrase";
+    /**
+     * Noun phrases are enabled if {@link #NOUN_PHRASE_STATE} is not configured.
+     */
+    public static final boolean DEFAULT_NOUN_PHRASE_STATE = true;
+    /**
+     * Key of the boolean property that enables/disables verb phrases.
+     */
+    public static final String VERB_PHRASE_STATE = "enhancer.engine.poschunker.verbPhrase";
+    /**
+     * Verb phrases are disabled if {@link #VERB_PHRASE_STATE} is not configured.
+     */
+    public static final boolean DEFAULT_VERB_PHRASE_STATE = false;
+    
+    /** Phrase type definition used for noun phrases (set by the static initializer) */
+    private static final PhraseTypeDefinition NOUN_PHRASE_TYPE;
+    /** Phrase type definition used for verb phrases (set by the static initializer) */
+    private static final PhraseTypeDefinition VERB_PHRASE_TYPE;
+
+    //TODO: maybe move this to PhraseTypeDefinition
+    static {
+        PhraseTypeDefinition nounPD = new PhraseTypeDefinition(LexicalCategory.Noun);
+        //start types noun (automatically included) pronoun or determiners, adjectives 
+        nounPD.addStartType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        //continuation types are nouns, adpositions , pronouns, determiner, adjectives and punctations
+        //optionally one could also allow Adverbs, PronounOrDeterminer
+        nounPD.addContinuationType(LexicalCategory.Adjective, LexicalCategory.Adposition,
+            LexicalCategory.Punctuation); //LexicalCategory.PronounOrDeterminer, LexicalCategory.Adverb, );
+        //end types are the same as start terms
+        nounPD.addEndType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        //and required types do include a Noun (what is actually included by default)
+        NOUN_PHRASE_TYPE = nounPD;
+
+        PhraseTypeDefinition verbPD = new PhraseTypeDefinition(LexicalCategory.Verb);
+        verbPD.addStartType(LexicalCategory.Adverb);
+        verbPD.addContinuationType(LexicalCategory.Adverb,LexicalCategory.Punctuation);
+        verbPD.addEndType(LexicalCategory.Adverb);
+        //and required types do include a Verbs (what is actually included by default)
+        VERB_PHRASE_TYPE = verbPD;
+    }
+    
+    private static Logger log = LoggerFactory.getLogger(PosChunkerEngine.class);
+
+    /**
+     * The language configuration. Initialised with the wildcard "*" as default,
+     * updated with the component properties on activation and reset to the
+     * default on deactivation.
+     */
+    private LanguageConfiguration languageConfiguration = new LanguageConfiguration(CONFIG_LANGUAGES, 
+        new String []{"*"});
+    
+
+    /**
+     * The minimum POS score parsed to the {@link PhraseBuilder}s. Set on
+     * activation; <code>-1</code> initially and after deactivation.
+     */
+    private double minPosScore = -1;
+
+    /**
+     * The definitions of the enabled phrase types. Created on activation;
+     * <code>null</code> until the engine was activated for the first time.
+     */
+    private List<PhraseTypeDefinition> phraseTypeDefinitions;
+    
+    /**
+     * Indicates if this engine can enhance the supplied ContentItem.
+     * <p/>
+     * Returns CANNOT_ENHANCE if no phrase type is enabled, if no language is
+     * returned for the content item, if the language is not covered by the
+     * language configuration or if no {@link AnalysedText} is returned for the
+     * content item. In all other cases ENHANCE_ASYNC is returned.
+     *
+     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+     *          if the introspecting process of the content item
+     *          fails
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        if(phraseTypeDefinitions.isEmpty()){
+            return CANNOT_ENHANCE; //no phrase type enabled ... nothing to do
+        }
+        final String lang = getLanguage(this, ci,false);
+        //NOTE: the checks are evaluated left to right and stop at the first
+        //      one that fails
+        final boolean processable = lang != null
+                && isLangaugeConfigured(this,languageConfiguration,lang,false)
+                && getAnalysedText(this,ci,false) != null;
+        //this engine suggests asynchronous enhancement
+        return processable ? ENHANCE_ASYNC : CANNOT_ENHANCE;
+    }
+
+    /**
+     * Builds the phrases for the {@link AnalysedText} of the supplied
+     * ContentItem. For every enabled {@link PhraseTypeDefinition} a
+     * {@link PhraseBuilder} is created. All builders share a
+     * {@link ChunkFactoryImpl} bound to the AnalysedText and the lock of the
+     * ContentItem. The sentences - or the whole text if there are no
+     * sentences - are parsed token by token to every builder.
+     *
+     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+     *          if the underlying process failed to work as
+     *          expected
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        final AnalysedText analysedText = getAnalysedText(this, ci, true);
+        final String lang = getLanguage(this, ci, true);
+        isLangaugeConfigured(this, languageConfiguration, lang, true);
+        //create one PhraseBuilder per enabled phrase type
+        final ChunkFactory factory = new ChunkFactoryImpl(analysedText, ci.getLock());
+        final List<PhraseBuilder> builders = new ArrayList<PhraseBuilder>(phraseTypeDefinitions.size());
+        for(PhraseTypeDefinition definition : phraseTypeDefinitions){
+            builders.add(new PhraseBuilder(definition, factory, minPosScore));
+        }
+        Iterator<? extends Section> sections = analysedText.getSentences();
+        if(!sections.hasNext()){
+            //no sentences ... treat the whole text as a single section
+            sections = Collections.singleton(analysedText).iterator();
+        }
+        while(sections.hasNext()){
+            final Section section = sections.next();
+            //notify all builders about the new section ...
+            for(PhraseBuilder builder : builders){
+                builder.nextSection(section);
+            }
+            // ... and parse them the tokens of this section
+            for(Iterator<Token> tokens = section.getTokens(); tokens.hasNext();){
+                final Token current = tokens.next();
+                for(PhraseBuilder builder : builders){
+                    builder.nextToken(current);
+                }
+            }
+        }
+        //a NULL section signals the end of the document
+        for(PhraseBuilder builder : builders){
+            builder.nextSection(null);
+        }
+        if(log.isTraceEnabled()){
+            logChunks(analysedText);
+        }
+    }
+    
+    /**
+     * Returns the unmodifiable service properties of this engine.
+     * @return the {@link #SERVICE_PROPERTIES} map
+     */
+    @Override
+    public Map<String,Object> getServiceProperties() {
+        return SERVICE_PROPERTIES;
+    }
+    
+    /**
+     * Writes the sentences and chunks of the parsed text - including the
+     * tokens and POS annotations of every chunk - to the log (level TRACE).
+     * @param at the text to log the chunks for
+     */
+    private void logChunks(AnalysedText at){
+        Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
+        while(spans.hasNext()){
+            Span span = spans.next();
+            if(span.getType() != SpanTypeEnum.Chunk){
+                //sentences are only logged as separator
+                log.trace("--- {}",span);
+                continue;
+            }
+            Value<PhraseTag> phraseAnno = span.getAnnotation(PHRASE_ANNOTATION);
+            Object phraseType = phraseAnno != null ? phraseAnno.value().getTag() : "unknown";
+            log.trace(" > {} Phrase: {} {}", new Object[]{phraseType, span, span.getSpan()});
+            log.trace("  Tokens: ");
+            //log the tokens of the chunk with an one based index
+            int index = 0;
+            Iterator<Token> tokens = ((Chunk)span).getTokens();
+            while(tokens.hasNext()){
+                Token token = tokens.next();
+                index++;
+                log.trace("    {}. {}{}", new Object[]{index,token.getSpan(),
+                        token.getAnnotations(NlpAnnotations.POS_ANNOTATION)});
+            }
+        }
+    }
+
+    /**
+     * Activates the engine and reads its configuration from the properties of
+     * the parsed component context: the minimum POS score
+     * ({@link #MIN_POS_SCORE}), the language configuration
+     * ({@link #CONFIG_LANGUAGES}) and the enabled phrase types
+     * ({@link #NOUN_PHRASE_STATE}, {@link #VERB_PHRASE_STATE}).
+     *
+     * @param ce the {@link org.osgi.service.component.ComponentContext}
+     * @throws ConfigurationException if the configured minimum POS score is
+     * not a number in the range [0..1) or if the configuration is rejected by
+     * the super class or the language configuration
+     */
+    @Activate
+    protected void activate(ComponentContext ce) throws ConfigurationException {
+        log.info("activating POS chunker engine");
+        super.activate(ce);
+        @SuppressWarnings("unchecked")
+        Dictionary<String, Object> properties = ce.getProperties();
+        
+        //read the min POS score (use the default if not configured)
+        Double configuredScore = parseMinPosScore(properties.get(MIN_POS_SCORE));
+        this.minPosScore = configuredScore == null ? 
+                DEFAULT_MIN_POS_SCORE : configuredScore.doubleValue();
+        log.info(" > set minimum POS score to {} (Engine: {})",
+            this.minPosScore, getName());
+        
+        //read the language configuration
+        languageConfiguration.setConfiguration(properties);
+        
+        //configure the PhraseType definitions
+        List<PhraseTypeDefinition> definitions = new ArrayList<PhraseTypeDefinition>(2);
+        if(isPhraseTypeEnabled(properties.get(NOUN_PHRASE_STATE), DEFAULT_NOUN_PHRASE_STATE)){
+            definitions.add(NOUN_PHRASE_TYPE);
+        }
+        if(isPhraseTypeEnabled(properties.get(VERB_PHRASE_STATE), DEFAULT_VERB_PHRASE_STATE)){
+            definitions.add(VERB_PHRASE_TYPE);
+        }
+        //set the field only after the list is completely built
+        phraseTypeDefinitions = definitions;
+    }
+
+    /**
+     * Parses and validates the value configured for {@link #MIN_POS_SCORE}.
+     * @param value the configured value (a Number, or any object with a
+     * string representation that can be parsed as double)
+     * @return the score or <code>null</code> if the value is <code>null</code>
+     * or has an empty string representation
+     * @throws ConfigurationException if the value is not a number (including
+     * NaN) or not in the range [0..1)
+     */
+    private static Double parseMinPosScore(Object value) throws ConfigurationException {
+        double score;
+        if(value instanceof Number){
+            score = ((Number)value).doubleValue();
+        } else if (value != null && !value.toString().isEmpty()){
+            try {
+                score = Double.parseDouble(value.toString());
+            } catch (NumberFormatException e) {
+                throw new ConfigurationException(MIN_POS_SCORE, 
+                    "The configured minimum POS score '" + value + "' MUST BE a "
+                    + "floating point number in the range [0..1).",e);
+            }
+        } else {
+            return null; //not configured
+        }
+        //NaN would pass both range comparisons and needs an explicit check
+        if(Double.isNaN(score) || score >= 1d || score < 0d){
+            throw new ConfigurationException(MIN_POS_SCORE, 
+                "The configured minimum POS score '" + score + "' MUST BE a "
+                + "floating point number in the range [0..1).");
+        }
+        return Double.valueOf(score);
+    }
+
+    /**
+     * Evaluates the configured state of a phrase type.
+     * @param value the configured value or <code>null</code> if not configured
+     * @param defaultState the state used if the value is <code>null</code>
+     * @return the boolean parsed from the string representation of the value
+     * or the default state if the value is <code>null</code>
+     */
+    private static boolean isPhraseTypeEnabled(Object value, boolean defaultState) {
+        return value == null ? defaultState : Boolean.parseBoolean(value.toString());
+    }
+    
+    /**
+     * Deactivates the engine: sets the minimum POS score back to
+     * <code>-1</code>, resets the language configuration to its default and
+     * finally delegates to the super class.
+     * @param context the {@link org.osgi.service.component.ComponentContext}
+     */
+    @Deactivate
+    protected void deactivate(ComponentContext context){
+        minPosScore = -1;
+        languageConfiguration.setDefault();
+        super.deactivate(context);
+    }
+   
+
+}

Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties Thu Jan  9 15:37:16 2014
@@ -0,0 +1,44 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+#
+# Name and description of the component
+org.apache.stanbol.enhancer.engines.poschunker.engine.PosChunkerEngine.name=Apache \
+Stanbol Enhancer Engine: POS based Chunking / Noun Phrase Detection
+org.apache.stanbol.enhancer.engines.poschunker.engine.PosChunkerEngine.description=Enhancement \
+Engine that extracts Verb/Noun phrases based on the LexicalTypes of POS annotations.
+
+# Labels of the configuration properties. Keys follow the pattern
+# '<property-name>.name' and '<property-name>.description'
+stanbol.enhancer.engine.name.name=name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
+
+enhancer.engine.poschunker.languages.name=Language configuration
+enhancer.engine.poschunker.languages.description=Takes a list of ISO \
+  language codes. '*' is the Wildcard; '!{lang}' to exclude a language
+
+enhancer.engine.poschunker.minPosScore.name=Min POS confidence
+enhancer.engine.poschunker.minPosScore.description=The minimum confidence of \
+POS annotations so that they are considered by the Chunker
+
+enhancer.engine.poschunker.nounPhrase.name=Noun Phrase
+enhancer.engine.poschunker.nounPhrase.description=Enables/Disables the extraction \
+of Noun Phrases.
+
+enhancer.engine.poschunker.verbPhrase.name=Verb Phrase
+enhancer.engine.poschunker.verbPhrase.description=Enables/Disables the extraction \
+of Verb Phrases.

Modified: stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml?rev=1556839&r1=1556838&r2=1556839&view=diff
==============================================================================
--- stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml (original)
+++ stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml Thu Jan  9 15:37:16 2014
@@ -214,6 +214,13 @@
       <artifactId>org.apache.stanbol.enhancer.engines.restful.nlp</artifactId>
       <version>1.0.0-SNAPSHOT</version>
     </bundle>
+    
+    <bundle> <!-- POS annotation based chunker (STANBOL-1251) -->
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.engines.poschunker</artifactId>
+      <version>1.0.0-SNAPSHOT</version>
+    </bundle>
+    
     <!-- NLP metadata to RDF (using NIF 1.0) - NOT YET READY FOR DEFAULT CONFIG
     <bundle>
       <groupId>org.apache.stanbol</groupId>