You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/09 16:37:17 UTC
svn commit: r1556839 - in /stanbol/trunk: enhancement-engines/
enhancement-engines/pos-chunker/ enhancement-engines/pos-chunker/src/
enhancement-engines/pos-chunker/src/main/
enhancement-engines/pos-chunker/src/main/java/
enhancement-engines/pos-chunke...
Author: rwesten
Date: Thu Jan 9 15:37:16 2014
New Revision: 1556839
URL: http://svn.apache.org/r1556839
Log:
STANBOL-1251: first version of the Pos-Chunker engine
Added:
stanbol/trunk/enhancement-engines/pos-chunker/ (with props)
stanbol/trunk/enhancement-engines/pos-chunker/pom.xml
stanbol/trunk/enhancement-engines/pos-chunker/src/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java
stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/
stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties
Modified:
stanbol/trunk/enhancement-engines/pom.xml
stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml
Modified: stanbol/trunk/enhancement-engines/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pom.xml?rev=1556839&r1=1556838&r2=1556839&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/pom.xml (original)
+++ stanbol/trunk/enhancement-engines/pom.xml Thu Jan 9 15:37:16 2014
@@ -60,6 +60,7 @@
<module>langdetect</module>
<module>langid</module>
<module>opennlp</module>
+ <module>pos-chunker</module>
<!-- Chinese language support -->
<module>smartcn-token</module> <!-- sentence detection and tokenizing -->
Propchange: stanbol/trunk/enhancement-engines/pos-chunker/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jan 9 15:37:16 2014
@@ -0,0 +1,7 @@
+.settings
+
+.project
+
+.classpath
+
+target
Added: stanbol/trunk/enhancement-engines/pos-chunker/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/pom.xml?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/pom.xml (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/pom.xml Thu Jan 9 15:37:16 2014
@@ -0,0 +1,104 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>apache-stanbol-enhancement-engines</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ <relativePath>..</relativePath>
+ </parent>
+
+ <artifactId>org.apache.stanbol.enhancer.engines.poschunker</artifactId>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancement Engine : POS tag based Chunker</name>
+ <description>
+ Uses POS tag information of Tokens to create Noun and Verb phrases.
+ </description>
+
+ <inceptionYear>2014</inceptionYear>
+
+  <scm>
+    <connection>
+      scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/pos-chunker
+    </connection>
+    <developerConnection>
+      scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/pos-chunker
+    </developerConnection>
+    <url>http://stanbol.apache.org/</url>
+  </scm>
+
+ <properties>
+ <opennlp.model.path>org/apache/stanbol/data/opennlp</opennlp.model.path>
+ </properties>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Import-Package>
+ org.apache.stanbol.enhancer.servicesapi; provide:=true; version="[0.11,1.1)",
+ org.apache.stanbol.enhancer.servicesapi.impl; provide:=true; version="[0.11,1.1)",
+ *
+ </Import-Package>
+ <!-- Export the PhraseBuilder and PhraseTypeDefinition -->
+ <Export-Package>
+ org.apache.stanbol.enhancer.engines.poschunker;version=${project.version}
+ </Export-Package>
+ <!-- Keep Engine private as it is used as a service -->
+ <Private-Package>
+ org.apache.stanbol.enhancer.engines.poschunker.engine;version=${project.version}
+ </Private-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <!-- AL20 License -->
+ <exclude>src/license/THIRD-PARTY.properties</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ </dependency>
+ </dependencies>
+
+</project>
Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java Thu Jan 9 15:37:16 2014
@@ -0,0 +1,203 @@
+package org.apache.stanbol.enhancer.engines.poschunker;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+public class PhraseBuilder {
+
+ /**
+ * Just a fallback in case Pos annotations do not provide probabilities.
+ * In most cases the value of this will not have any effect as typically
+ * Pos Taggers that do not provide probabilities only emit a
+ * single POS tag per Token. In such cases this tag will be always accepted
+ * regardless of the configured value. <p>
+ * The value is only important if some Pos annotation for a Token do have
+ * probabilities while others have not. In such cases those without are rated
+ * against other that have by using this value. Such Situations should only
+ * occur if a chain uses several POS taggers - a setting that should be
+ * avoided<p>
+ */
+ private static final double DEFAULT_SCORE = 0.1;
+
+ private final PhraseTypeDefinition phraseType;
+
+ private final ChunkFactory chunkFactory;
+
+ private final double minPosSocre;
+ /**
+ * The {@link PhraseTag} added to all {@link Chunk}s created by this
+ * {@link PhraseBuilder}
+ */
+ private final PhraseTag phraseTag;
+
+ /**
+ * Holds Tokens of a current phrase. Empty if no phrase is building.
+ */
+ private List<Token> current = new ArrayList<Token>();
+ /**
+ * If {@link #current} contains a Tokens matching
+ * {@link PhraseTypeDefinition#getRequiredType()}
+ */
+ boolean valid;
+
+    /**
+     * Creates a builder that collects consecutive {@link Token}s into phrases
+     * of the parsed type.
+     * @param phraseType the definition of the phrase type to build. MUST NOT
+     * be <code>null</code>
+     * @param chunkFactory the factory used to create the {@link Chunk}s for
+     * detected phrases. MUST NOT be <code>null</code>
+     * @param minPosSocre the minimum (normalized) score the POS annotations of
+     * a Token need to reach so that a set of categories is considered as
+     * matched. MUST BE within the range [0..1]
+     * @throws IllegalArgumentException if the phrase type or the chunk factory
+     * is <code>null</code> or if the score is NaN or outside of [0..1]
+     */
+    public PhraseBuilder(PhraseTypeDefinition phraseType, ChunkFactory chunkFactory, double minPosSocre) {
+        if(phraseType == null){
+            throw new IllegalArgumentException("The parsed PhraseTypeDefinition MUST NOT be NULL!");
+        }
+        if(chunkFactory == null){
+            throw new IllegalArgumentException("The parsed ChunkFactory MUST NOT be NULL");
+        }
+        //written as negated range check so that NaN is rejected as well
+        //(with NaN every '>= minPosSocre' comparison would silently be false)
+        if(!(minPosSocre >= 0 && minPosSocre <= 1)){
+            throw new IllegalArgumentException("The parsed minPosScore '" + minPosSocre
+                + "' MUST BE within the range [0..1]!");
+        }
+        this.phraseType = phraseType;
+        //the tag uses the name of the lexical category as tag string
+        this.phraseTag = new PhraseTag(phraseType.getPhraseType().name(),
+            phraseType.getPhraseType());
+        this.chunkFactory = chunkFactory;
+        this.minPosSocre = minPosSocre;
+    }
+
+
+ public void nextToken(Token token){
+ if(current.isEmpty()){ //check for start
+ checkStart(token);
+ } else if(!checkContinuation(token)){ //check for continuation
+ buildPhrase(token);
+ }
+
+ }
+
+ /**
+ * Signals that a new {@link Section} starts. The phrase collected so far
+ * (if any) is built, so that phrases do not span multiple sections.
+ * @param section the section that starts. The value is not used by this
+ * implementation
+ */
+ public void nextSection(Section section){
+ buildPhrase(null);
+ }
+
+
+    /**
+     * Checks if the parsed token can start a phrase. If so the token is added
+     * to {@link #current} and {@link #valid} is set based on whether the token
+     * also matches the required types.
+     * @param token the token to check
+     */
+    @SuppressWarnings("unchecked") //varargs with generic types
+    private void checkStart(Token token){
+        //index 0: matches start types, index 1: matches required types
+        boolean[] startAndRequired = checkCategories(token,
+            phraseType.getStartType(), phraseType.getRequiredType());
+        if(!startAndRequired[0]){
+            return; //not a valid start
+        }
+        current.add(token);
+        valid = startAndRequired[1];
+    }
+
+    /**
+     * Checks if the parsed token continues the phrase currently collected and,
+     * if so, appends it to {@link #current}. As long as the phrase is not yet
+     * {@link #valid} the token is also checked against the
+     * {@link PhraseTypeDefinition#getRequiredType() required types}.
+     * @param token the token following the tokens collected so far
+     * @return <code>true</code> if the token was added to the current phrase
+     */
+    @SuppressWarnings("unchecked") //varargs with generic types
+    private boolean checkContinuation(Token token){
+        if(valid){ //a required type is already present: only check continuation
+            boolean continues = checkCategories(token, phraseType.getContinuationType())[0];
+            if(continues){
+                current.add(token);
+            }
+            return continues;
+        }
+        //index 0: matches continuation types, index 1: matches required types
+        boolean[] states = checkCategories(token, phraseType.getContinuationType(),
+            phraseType.getRequiredType());
+        if(states[0]){
+            current.add(token);
+            //only a token that is part of the phrase can make the phrase
+            //valid. A token that matches a required type but does not continue
+            //the phrase MUST NOT validate the tokens collected before it
+            valid = states[1];
+        }
+        return states[0];
+    }
+
+ /**
+ * Finishes the phrase currently collected in {@link #current}. If the
+ * phrase is {@link #valid} a {@link Chunk} annotated with the
+ * {@link #phraseTag} is created from the first collected token to the last
+ * collected token matching an end type. Phrases that would consist of a
+ * single token only are ignored. Afterwards the state of the builder is
+ * reset and the parsed token (if any) is checked as start of a new phrase.
+ * @param token the token that ended the phrase or <code>null</code> if
+ * the phrase is ended for an other reason (e.g. a new section)
+ */
+ @SuppressWarnings("unchecked") //varargs with generic types
+ private void buildPhrase(Token token) {
+ Token lastConsumedToken = null;
+ if(valid){
+ //search backwards for the first token matching an allowed end
+ //category (tokens after it are not included in the chunk)
+ int endIndex = current.size()-1;
+ while(endIndex > 0 && !checkCategories(current.get(endIndex),
+ phraseType.getEndType())[0]){
+ endIndex--;
+ }
+ lastConsumedToken = current.get(endIndex);
+ //NOTE: ignore phrases with a single token
+ if(endIndex > 0){
+ Chunk chunk = chunkFactory.createChunk(current.get(0), lastConsumedToken);
+ //TODO: add support for confidence
+ chunk.addAnnotation(PHRASE_ANNOTATION, Value.value(phraseTag));
+ }
+ }
+ //cleanup: reset the state so that a new phrase can be collected
+ current.clear();
+ valid = false;
+ if(token != null && !token.equals(lastConsumedToken)){
+ //the current token might be the start of a new phrase
+ checkStart(token);
+ }
+ }
+
+    /**
+     * Checks if the {@link NlpAnnotations#POS_ANNOTATION POS Annotations}
+     * of a {@link Token} match the parsed categories. This method supports
+     * checking against multiple sets of categories to allow e.g. checking if
+     * a token is suitable for {@link PhraseTypeDefinition#getStartType()} and
+     * {@link PhraseTypeDefinition#getRequiredType()} with a single call.
+     * @param token the Token
+     * @param categories the sets of categories to check
+     * @return an array with one boolean state for each parsed set of
+     * categories (same order). A state is <code>true</code> if the summed up
+     * score of the POS annotations matching that set divided by the
+     * normalization score is higher or equals the configured
+     * {@link #minPosSocre}. For tokens without any POS annotation all states
+     * are <code>false</code>.
+     */
+    private boolean[] checkCategories(Token token, Set<LexicalCategory>...categories) {
+        //there are different ways NLP frameworks do assign scores. For some the
+        //sum of all categories would sum up to 1.0, but as only the top three
+        //categories are included the sum would be < 1
+        //Others assign scores so that each score is < 1, but the sum of all
+        //is higher as 1.0.
+        //There is also the possibility that no scores are present.
+
+        //Because of that this sums up all scores and normalizes with the
+        //Math.max(1.0,sumScore).
+        //POS tags without score are assigned a #DEFAULT_SCORE. If not a single
+        //POS tag with a score is present the sumScore is NOT normalized to 1.0
+        boolean scorePresent = false;
+        double sumScore = 0;
+        double[] matchScores = new double[categories.length];
+        for(Value<PosTag> pos : token.getAnnotations(POS_ANNOTATION)){
+            double score = pos.probability();
+            if(score == Value.UNKNOWN_PROBABILITY){
+                score = DEFAULT_SCORE;
+            } else {
+                scorePresent = true;
+            }
+            //NOTE: use the (possibly defaulted) score and NOT pos.probability()
+            //as otherwise the unknown probability marker would be summed up
+            sumScore = sumScore + score;
+            Set<LexicalCategory> tokenCategories = pos.value().getCategories();
+            for(int i = 0; i < categories.length; i++){
+                if(!Collections.disjoint(tokenCategories, categories[i])){
+                    matchScores[i] = matchScores[i] + score;
+                }
+            }
+        }
+        boolean[] matches = new boolean[matchScores.length];
+        //the score used to normalize annotations. See comments at method start
+        double normScore = scorePresent ? Math.max(1.0,sumScore) : sumScore;
+        for(int i = 0; i < matchScores.length ; i++){
+            //normScore is 0 if the token has no POS annotation: no match
+            matches[i] = normScore > 0 && matchScores[i]/normScore >= minPosSocre;
+        }
+        return matches;
+    }
+
+ /**
+ * Factory used by the {@link PhraseBuilder} to create the {@link Chunk}s
+ * for detected phrases.
+ */
+ public static interface ChunkFactory {
+
+ /**
+ * Creates the chunk for a phrase
+ * @param start the first token of the phrase
+ * @param end the last token of the phrase
+ * @return the created chunk
+ */
+ Chunk createChunk(Token start, Token end);
+ }
+
+}
Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java Thu Jan 9 15:37:16 2014
@@ -0,0 +1,138 @@
+package org.apache.stanbol.enhancer.engines.poschunker;
+
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+
+public class PhraseTypeDefinition {
+
+ protected final LexicalCategory phraseType;
+
+ private final Set<LexicalCategory> startTypes;
+ protected final Set<LexicalCategory> readOnlyStartTypes;
+ private final Set<LexicalCategory> continuationTypes;
+ protected final Set<LexicalCategory> readOnlyContinuationTypes;
+ private final Set<LexicalCategory> requiredTypes;
+ protected final Set<LexicalCategory> readOnlyRequiredTypes;
+ private final Set<LexicalCategory> endTypes;
+ protected final Set<LexicalCategory> readOnlyEndTypes;
+
+    /**
+     * Creates a phrase definition where the parsed category is the only
+     * start, continuation, required and end type. Additional types can be
+     * configured by using the add/remove methods of this class.
+     * @param phraseType the type of the phrase. MUST NOT be <code>null</code>
+     * @throws IllegalArgumentException if the parsed phrase type is
+     * <code>null</code>
+     */
+    public PhraseTypeDefinition(LexicalCategory phraseType) {
+        if(phraseType == null){
+            throw new IllegalArgumentException("The parsed PhraseType MUST NOT be NULL!");
+        }
+        this.phraseType = phraseType;
+        //for each kind of types a modifiable set and a read-only view (that
+        //is returned by the getters) is created
+        startTypes = EnumSet.of(phraseType);
+        readOnlyStartTypes = Collections.unmodifiableSet(startTypes);
+        continuationTypes = EnumSet.of(phraseType);
+        readOnlyContinuationTypes = Collections.unmodifiableSet(continuationTypes);
+        requiredTypes = EnumSet.of(phraseType);
+        readOnlyRequiredTypes = Collections.unmodifiableSet(requiredTypes);
+        endTypes = EnumSet.of(phraseType);
+        //NOTE: the view MUST wrap the endTypes (and not the startTypes) as
+        //otherwise changes to the end types are not visible via getEndType()
+        readOnlyEndTypes = Collections.unmodifiableSet(endTypes);
+    }
+
+ /**
+ * Adds the parsed categories to the start types. <code>null</code> values
+ * are ignored.
+ * @return <code>true</code> if the start types were changed
+ */
+ public boolean addStartType(LexicalCategory...types){
+ return add(startTypes,types);
+ }
+
+ /**
+ * Adds the parsed categories to the continuation types. <code>null</code>
+ * values are ignored.
+ * @return <code>true</code> if the continuation types were changed
+ */
+ public boolean addContinuationType(LexicalCategory...types){
+ return add(continuationTypes,types);
+ }
+
+ /**
+ * Adds the parsed categories to the required types. <code>null</code>
+ * values are ignored.
+ * @return <code>true</code> if the required types were changed
+ */
+ public boolean addRequiredType(LexicalCategory...types){
+ return add(requiredTypes,types);
+ }
+ /**
+ * Adds the parsed categories to the end types. <code>null</code> values
+ * are ignored.
+ * @return <code>true</code> if the end types were changed
+ */
+ public boolean addEndType(LexicalCategory...types){
+ return add(endTypes,types);
+ }
+
+ /**
+ * Removes the parsed categories from the start types. <code>null</code>
+ * values are ignored.
+ * @return <code>true</code> if the start types were changed
+ */
+ public boolean removeStartType(LexicalCategory...types){
+ return remove(startTypes,types);
+ }
+
+ /**
+ * Removes the parsed categories from the continuation types.
+ * <code>null</code> values are ignored.
+ * @return <code>true</code> if the continuation types were changed
+ */
+ public boolean removeContinuationType(LexicalCategory...types){
+ return remove(continuationTypes,types);
+ }
+
+ /**
+ * Removes the parsed categories from the required types. <code>null</code>
+ * values are ignored.
+ * @return <code>true</code> if the required types were changed
+ */
+ public boolean removeRequiredType(LexicalCategory...types){
+ return remove(requiredTypes,types);
+ }
+
+ /**
+ * Removes the parsed categories from the end types. <code>null</code>
+ * values are ignored.
+ * @return <code>true</code> if the end types were changed
+ */
+ public boolean removeEndType(LexicalCategory...types){
+ return remove(endTypes,types);
+ }
+ /**
+ * Getter for the type of this phrase definition
+ * @return the {@link LexicalCategory} parsed as phrase type to the constructor
+ */
+ public LexicalCategory getPhraseType(){
+ return phraseType;
+ }
+
+ /**
+ * Getter for the read only set with the start types
+ * @return the read only set with {@link LexicalCategory LexicalCategories}
+ * that can start a phrase of that type
+ */
+ public Set<LexicalCategory> getStartType(){
+ return readOnlyStartTypes;
+ }
+
+ /**
+ * Getter for the read only set with the continuation types
+ * @return the read only set with {@link LexicalCategory LexicalCategories}
+ * that can continue a phrase of that type
+ */
+ public Set<LexicalCategory> getContinuationType(){
+ return readOnlyContinuationTypes;
+ }
+
+ /**
+ * Getter for the read only set with the required types
+ * @return the read only set with {@link LexicalCategory LexicalCategories}
+ * that MUST occur within a phrase of that type
+ */
+ public Set<LexicalCategory> getRequiredType(){
+ return readOnlyRequiredTypes;
+ }
+
+ /**
+ * Getter for the read only set with the end types.
+ * @return the read only set with {@link LexicalCategory LexicalCategories}
+ * that can end a phrase of that type
+ */
+ public Set<LexicalCategory> getEndType(){
+ return readOnlyEndTypes;
+ }
+
+    /**
+     * Adds all non-<code>null</code> types to the parsed set.
+     * @param set the set to add the types to
+     * @param types the types to add. <code>null</code> (array and elements)
+     * is ignored
+     * @return <code>true</code> if at least one type was added to the set
+     */
+    private boolean add(Set<LexicalCategory> set, LexicalCategory...types){
+        if(types == null){
+            return false; //nothing to add
+        }
+        boolean modified = false;
+        for(LexicalCategory category : types){
+            if(category != null && set.add(category)){
+                modified = true;
+            }
+        }
+        return modified;
+    }
+
+    /**
+     * Removes all non-<code>null</code> types from the parsed set.
+     * @param set the set to remove the types from
+     * @param types the types to remove. <code>null</code> (array and
+     * elements) is ignored
+     * @return <code>true</code> if at least one type was removed from the set
+     */
+    private boolean remove(Set<LexicalCategory> set, LexicalCategory...types){
+        if(types == null){
+            return false; //nothing to remove
+        }
+        boolean modified = false;
+        for(LexicalCategory category : types){
+            if(category != null && set.remove(category)){
+                modified = true;
+            }
+        }
+        return modified;
+    }
+}
Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/ChunkFactoryImpl.java Thu Jan 9 15:37:16 2014
@@ -0,0 +1,41 @@
+package org.apache.stanbol.enhancer.engines.poschunker.engine;
+
+import java.util.concurrent.locks.ReadWriteLock;
+
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder.ChunkFactory;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+
+/**
+ * Implementation of the {@link ChunkFactory} interface used by the
+ * {@link PhraseBuilder} to create chunks. Chunks are added to the
+ * {@link AnalysedText} while holding the write lock parsed to the constructor.
+ * @author Rupert Westenthaler
+ *
+ */
+public class ChunkFactoryImpl implements ChunkFactory{
+
+    private final AnalysedText at;
+    private final ReadWriteLock lock;
+
+    /**
+     * Creates a factory that adds chunks to the parsed {@link AnalysedText}
+     * @param at the analysed text the chunks are added to. MUST NOT be
+     * <code>null</code>
+     * @param lock the lock used to protect write access to the analysed text.
+     * MUST NOT be <code>null</code>
+     * @throws IllegalArgumentException if any of the parameters is
+     * <code>null</code>
+     */
+    public ChunkFactoryImpl(AnalysedText at, ReadWriteLock lock) {
+        //fail early instead of with a NullPointerException in createChunk(..)
+        if(at == null){
+            throw new IllegalArgumentException("The parsed AnalysedText MUST NOT be NULL!");
+        }
+        if(lock == null){
+            throw new IllegalArgumentException("The parsed ReadWriteLock MUST NOT be NULL!");
+        }
+        this.at = at;
+        this.lock = lock;
+    }
+
+    /**
+     * Adds a chunk spanning from the start of the parsed start token to the
+     * end of the parsed end token.
+     * @throws IllegalArgumentException if the start or end token is
+     * <code>null</code>
+     */
+    @Override
+    public Chunk createChunk(Token start, Token end) {
+        if(start == null || end == null){
+            throw new IllegalArgumentException("Parsed start Token '" + start
+                + "' and end Token '" + end +"' MUST NOT be NULL!");
+        }
+        lock.writeLock().lock();
+        try {
+            return at.addChunk(start.getStart(), end.getEnd());
+        } finally {
+            lock.writeLock().unlock();
+        }
+    }
+
+}
Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java Thu Jan 9 15:37:16 2014
@@ -0,0 +1,340 @@
+/*
+ * Copyright (c) 2012 Sebastian Schaffert
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.stanbol.enhancer.engines.poschunker.engine;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.isLangaugeConfigured;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder.ChunkFactory;
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseTypeDefinition;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
+import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.osgi.framework.Constants;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A phrase detector (chunker) that is based on the POS tags of the
+ * {@link Token}s. It uses {@link PhraseBuilder}s with
+ * {@link PhraseTypeDefinition}s for noun and verb phrases (see
+ * {@link #NOUN_PHRASE_STATE} and {@link #VERB_PHRASE_STATE}) and does not
+ * use a chunker model.
+ * <p>
+ * The engine requires an {@link AnalysedText} to be present in the content
+ * item. Detected phrases are added as {@link Chunk}s with a
+ * {@link NlpAnnotations#PHRASE_ANNOTATION} to the {@link AnalysedText}.
+ *
+ * @author Sebastian Schaffert
+ */
+@Component(immediate = true, metatype = true,
+ configurationFactory = true, //allow multiple instances to be configured
+ policy = ConfigurationPolicy.OPTIONAL) //create the default instance with the default config
+@Service
+@Properties(value={
+ @Property(name=EnhancementEngine.PROPERTY_NAME,value="pos-chunker"),
+ @Property(name=PosChunkerEngine.CONFIG_LANGUAGES, value = {"*"}),
+ @Property(name=PosChunkerEngine.MIN_POS_SCORE,
+ doubleValue=PosChunkerEngine.DEFAULT_MIN_POS_SCORE),
+ @Property(name=PosChunkerEngine.NOUN_PHRASE_STATE,
+ boolValue=PosChunkerEngine.DEFAULT_NOUN_PHRASE_STATE),
+ @Property(name=PosChunkerEngine.VERB_PHRASE_STATE,
+ boolValue=PosChunkerEngine.DEFAULT_VERB_PHRASE_STATE),
+ @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
+})
+public class PosChunkerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {
+
+ private static final Map<String,Object> SERVICE_PROPERTIES;
+ static {
+ Map<String,Object> props = new HashMap<String,Object>();
+ props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
+ ServiceProperties.ORDERING_NLP_CHUNK);
+ props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
+ NlpProcessingRole.Chunking);
+ SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
+ }
+ /**
+ * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
+ * are the languages given as default value.
+ */
+ public static final String CONFIG_LANGUAGES = "enhancer.engine.poschunker.languages";
+
+ public static final String MIN_POS_SCORE = "enhancer.engine.poschunker.minPosScore";
+ public static final double DEFAULT_MIN_POS_SCORE = 0.5;
+
+ public static final String NOUN_PHRASE_STATE = "enhancer.engine.poschunker.nounPhrase";
+ public static final boolean DEFAULT_NOUN_PHRASE_STATE = true;
+ public static final String VERB_PHRASE_STATE = "enhancer.engine.poschunker.verbPhrase";
+ public static final boolean DEFAULT_VERB_PHRASE_STATE = false;
+
+ private static final PhraseTypeDefinition NOUN_PHRASE_TYPE;
+ private static final PhraseTypeDefinition VERB_PHRASE_TYPE;
+
+ //TODO: maybe move this to PhraseTypeDefinition
+ static {
+ PhraseTypeDefinition nounPD = new PhraseTypeDefinition(LexicalCategory.Noun);
+ //start types: nouns (automatically included), pronouns or determiners and adjectives
+ nounPD.addStartType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ //continuation types: nouns (automatically included), adjectives, adpositions and punctuation
+ //optionally one could also allow Adverbs, PronounOrDeterminer
+ nounPD.addContinuationType(LexicalCategory.Adjective, LexicalCategory.Adposition,
+ LexicalCategory.Punctuation); //LexicalCategory.PronounOrDeterminer, LexicalCategory.Adverb, );
+ //end types are the same as the start types
+ nounPD.addEndType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+ //required types: only nouns (automatically included)
+ NOUN_PHRASE_TYPE = nounPD;
+
+ PhraseTypeDefinition verbPD = new PhraseTypeDefinition(LexicalCategory.Verb);
+ verbPD.addStartType(LexicalCategory.Adverb);
+ verbPD.addContinuationType(LexicalCategory.Adverb,LexicalCategory.Punctuation);
+ verbPD.addEndType(LexicalCategory.Adverb);
+ //required types: only verbs (automatically included)
+ VERB_PHRASE_TYPE = verbPD;
+ }
+
+ private static Logger log = LoggerFactory.getLogger(PosChunkerEngine.class);
+
+ private LanguageConfiguration languageConfiguration = new LanguageConfiguration(CONFIG_LANGUAGES,
+ new String []{"*"});
+
+
+ private double minPosScore = -1;
+
+ private List<PhraseTypeDefinition> phraseTypeDefinitions;
+
+ /**
+ * Indicate if this engine can enhance supplied ContentItem, and if it
+ * suggests enhancing it synchronously or asynchronously. The
+ * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
+ * just a suggestion from the engine.
+ * <p/>
+ * Returns CANNOT_ENHANCE if no phrase type is configured, the language of the content is not
+ * available or not configured, or if the content item does not have an AnalysedText.
+ * Otherwise ENHANCE_ASYNC is returned.
+ *
+ * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+ * if the introspecting process of the content item
+ * fails
+ */
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ if(phraseTypeDefinitions.isEmpty()){
+ return CANNOT_ENHANCE; //Nothing to do
+ }
+ String language = getLanguage(this, ci,false);
+ if(language == null){
+ return CANNOT_ENHANCE;
+ }
+ if(!isLangaugeConfigured(this,languageConfiguration,language,false)){
+ return CANNOT_ENHANCE;
+ }
+ if(getAnalysedText(this,ci,false) == null) {
+ return CANNOT_ENHANCE;
+ }
+
+ // this engine suggests asynchronous enhancement
+ return ENHANCE_ASYNC;
+
+ }
+
+    /**
+     * Detects phrases in the {@link AnalysedText} of the parsed content item.
+     * For every configured {@link PhraseTypeDefinition} a
+     * {@link PhraseBuilder} is created. All tokens of all sentences (or of the
+     * whole text if no sentences are present) are parsed to those builders.
+     * Chunks for detected phrases are created via a {@link ChunkFactoryImpl}
+     * that uses the lock of the content item.
+     *
+     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
+     *             if the underlying process failed to work as
+     *             expected
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        final AnalysedText analysedText = getAnalysedText(this, ci, true);
+        final String lang = getLanguage(this, ci, true);
+        isLangaugeConfigured(this, languageConfiguration, lang, true);
+        //one PhraseBuilder per phrase type, all sharing the same factory
+        final ChunkFactory factory = new ChunkFactoryImpl(analysedText, ci.getLock());
+        final List<PhraseBuilder> builders = new ArrayList<PhraseBuilder>(phraseTypeDefinitions.size());
+        for(PhraseTypeDefinition definition : phraseTypeDefinitions){
+            builders.add(new PhraseBuilder(definition, factory, minPosScore));
+        }
+        Iterator<? extends Section> sections = analysedText.getSentences();
+        if(!sections.hasNext()){
+            //no sentences ... process the whole text as a single section
+            sections = Collections.singleton(analysedText).iterator();
+        }
+        while(sections.hasNext()){
+            final Section section = sections.next();
+            //notify the builders about the new section
+            for(PhraseBuilder builder : builders){
+                builder.nextSection(section);
+            }
+            //parse all tokens of the section to the builders
+            for(Iterator<Token> tokens = section.getTokens(); tokens.hasNext();){
+                final Token token = tokens.next();
+                for(PhraseBuilder builder : builders){
+                    builder.nextToken(token);
+                }
+            }
+        }
+        //signal the end of the document
+        for(PhraseBuilder builder : builders){
+            builder.nextSection(null);
+        }
+        if(log.isTraceEnabled()){
+            logChunks(analysedText);
+        }
+    }
+ /** Returns the SERVICE_PROPERTIES map (defined outside this excerpt) as-is; no copy is made. */
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return SERVICE_PROPERTIES;
+ }
+
+ private void logChunks(AnalysedText at){
+ Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
+ while(it.hasNext()){
+ Span span = it.next();
+ if(span.getType() == SpanTypeEnum.Chunk){
+ Value<PhraseTag> phraseAnno = span.getAnnotation(PHRASE_ANNOTATION);
+ log.trace(" > {} Phrase: {} {}", new Object[]{
+ phraseAnno != null ? phraseAnno.value().getTag() : "unknown",
+ span, span.getSpan()});
+ log.trace(" Tokens: ");
+ int i = 1;
+ for(Iterator<Token> tokens = ((Chunk)span).getTokens(); tokens.hasNext();i++){
+ Token token = tokens.next();
+ log.trace(" {}. {}{}", new Object[]{i,token.getSpan(),
+ token.getAnnotations(NlpAnnotations.POS_ANNOTATION)});
+ }
+ } else {
+ log.trace("--- {}",span);
+ }
+ }
+ }
+
+ /**
+  * Activates the engine and reads its configuration from the properties of
+  * the parsed component context:
+  * <ul>
+  * <li>MIN_POS_SCORE: the minimum POS score. Must be a number that is
+  * &gt;= 0 and &lt; 1. DEFAULT_MIN_POS_SCORE is used if the property is
+  * missing or empty.
+  * <li>the language configuration
+  * <li>NOUN_PHRASE_STATE and VERB_PHRASE_STATE: enable/disable the noun
+  * and verb phrase type. The DEFAULT_*_STATE constants are used for
+  * missing values.
+  * </ul>
+  *
+  * @param ce the {@link org.osgi.service.component.ComponentContext}
+  * @throws ConfigurationException if the configured minimum POS score is
+  * not a number or not within the allowed range (this includes NaN)
+  */
+ @Activate
+ protected void activate(ComponentContext ce) throws ConfigurationException {
+     log.info("activating POS chunking engine");
+     super.activate(ce);
+     @SuppressWarnings("unchecked")
+     Dictionary<String, Object> properties = ce.getProperties();
+
+     //read the min POS score
+     Object value = properties.get(MIN_POS_SCORE);
+     Double configuredScore;
+     if(value instanceof Number){
+         configuredScore = Double.valueOf(((Number)value).doubleValue());
+     } else if (value != null && !value.toString().isEmpty()){
+         try {
+             configuredScore = Double.valueOf(value.toString());
+         } catch (NumberFormatException e) {
+             throw new ConfigurationException(MIN_POS_SCORE,
+                 "The configured minimum POS score MUST BE a floating point "
+                 + "number in the range >= 0 and < 1.",e);
+         }
+     } else {
+         configuredScore = null;
+     }
+     if(configuredScore == null){
+         this.minPosScore = DEFAULT_MIN_POS_SCORE; //set to default
+     } else if(!(configuredScore.doubleValue() >= 0d && configuredScore.doubleValue() < 1d)){
+         //written as a negated range check so that NaN is rejected as well
+         //(every comparison with NaN evaluates to false)
+         throw new ConfigurationException(MIN_POS_SCORE,
+             "The configured minimum POS score '"+configuredScore+"' MUST BE a "
+             + "floating point number in the range >= 0 and < 1.");
+     } else {
+         this.minPosScore = configuredScore.doubleValue();
+     }
+     log.info(" > set minimum POS score to {} (Engine: {})",
+         this.minPosScore, getName());
+
+     //read the language configuration
+     languageConfiguration.setConfiguration(properties);
+
+     //configure the PhraseType definitions; the field is only assigned once
+     //the list is complete
+     List<PhraseTypeDefinition> definitions = new ArrayList<PhraseTypeDefinition>(2);
+     if(isPhraseTypeEnabled(properties, NOUN_PHRASE_STATE, DEFAULT_NOUN_PHRASE_STATE)){
+         definitions.add(NOUN_PHRASE_TYPE);
+     }
+     if(isPhraseTypeEnabled(properties, VERB_PHRASE_STATE, DEFAULT_VERB_PHRASE_STATE)){
+         definitions.add(VERB_PHRASE_TYPE);
+     }
+     phraseTypeDefinitions = definitions;
+ }
+
+ /**
+  * Reads the enabled/disabled state of a phrase type from the configuration.
+  *
+  * @param properties the component properties
+  * @param key the property holding the state
+  * @param defaultState the state used if the property is not present
+  * @return <code>defaultState</code> if no value is configured, otherwise
+  * the configured value parsed with {@link Boolean#parseBoolean(String)}
+  */
+ private static boolean isPhraseTypeEnabled(Dictionary<String, Object> properties,
+         String key, boolean defaultState){
+     Object state = properties.get(key);
+     return state == null ? defaultState : Boolean.parseBoolean(state.toString());
+ }
+
+ /**
+  * Deactivates the engine and resets the configuration read during
+  * activation: the language configuration is set back to its default, the
+  * minimum POS score to -1 and the list of phrase type definitions to an
+  * empty list.
+  *
+  * @param context the {@link org.osgi.service.component.ComponentContext}
+  */
+ @Deactivate
+ protected void deactivate(ComponentContext context){
+     this.languageConfiguration.setDefault();
+     this.minPosScore = -1;
+     //also drop the phrase type definitions: with an empty list canEnhance(..)
+     //returns CANNOT_ENHANCE instead of using the configuration of the
+     //previous activation
+     this.phraseTypeDefinitions = Collections.emptyList();
+     super.deactivate(context);
+ }
+
+
+}
Added: stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1556839&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ stanbol/trunk/enhancement-engines/pos-chunker/src/main/resources/OSGI-INF/metatype/metatype.properties Thu Jan 9 15:37:16 2014
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+org.apache.stanbol.enhancer.engines.poschunker.engine.PosChunkerEngine.name=Apache \
+Stanbol Enhancer Engine: POS based Chunking / Noun Phrase Detection
+org.apache.stanbol.enhancer.engines.poschunker.engine.PosChunkerEngine.description=Enhancement \
+Engine that extracts Verb/Noun phrases based on the LexicalTypes of POS annotations.
+
+stanbol.enhancer.engine.name.name=name
+stanbol.enhancer.engine.name.description=The name of the enhancement engine as \
+used in the RESTful interface '/engine/<name>'
+service.ranking.name=Ranking
+service.ranking.description=If two enhancement engines with the same name are active the \
+one with the higher ranking will be used to process parsed content items.
+
+enhancer.engine.poschunker.languages.name=Language configuration
+enhancer.engine.poschunker.languages.description=Takes a list of ISO \
+ language codes. '*' is the Wildcard; '!{lang}' to exclude a language
+
+enhancer.engine.poschunker.minPosScore.name= Min POS confidence
+enhancer.engine.poschunker.minPosScore.description=The minimum confidence of \
+POS annotations so that they are considered by the Chunker
+
+enhancer.engine.poschunker.nounPhrase.name=Noun Phrase
+enhancer.engine.poschunker.nounPhrase.description=Enables/Disables the extraction \
+of Noun Phrases.
+
+enhancer.engine.poschunker.verbPhrase.name=Verb Phrase
+enhancer.engine.poschunker.verbPhrase.description=Enables/Disables the extraction \
+of Verb Phrases.
Modified: stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml?rev=1556839&r1=1556838&r2=1556839&view=diff
==============================================================================
--- stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml (original)
+++ stanbol/trunk/launchers/bundlelists/enhancer/src/main/bundles/list.xml Thu Jan 9 15:37:16 2014
@@ -214,6 +214,13 @@
<artifactId>org.apache.stanbol.enhancer.engines.restful.nlp</artifactId>
<version>1.0.0-SNAPSHOT</version>
</bundle>
+
+ <bundle> <!-- POS annotation based chunker (STANBOL-1251) -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.poschunker</artifactId>
+ <version>1.0.0-SNAPSHOT</version>
+ </bundle>
+
<!-- NLP metadata to RDF (using NIF 1.0) - NOT YET READY FOR DEFAULT CONFIG
<bundle>
<groupId>org.apache.stanbol</groupId>