You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/02/24 06:12:03 UTC

svn commit: r1571145 - in /stanbol/branches/release-0.12/enhancement-engines: entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ entityhublinking/src/main/resources/OSGI-INF/metatype/ entitylinking/engine/src/main/java...

Author: rwesten
Date: Mon Feb 24 05:12:03 2014
New Revision: 1571145

URL: http://svn.apache.org/r1571145
Log:
Implementation of STANBOL-1285, STANBOL-1284, STANBOL-1283 and STANBOL-1282 for the 0.12-release branch.

Modified:
    stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
    stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java
    stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java
    stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
    stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java
    stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
    stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
    stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
    stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java

Modified: stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java Mon Feb 24 05:12:03 2014
@@ -26,7 +26,9 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES_FIELDS;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.MIN_SEARCH_TOKEN_LENGTH;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_INCLUDE_SIMILAR_SCORE;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.ENTITY_TYPES;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.INCLUDE_SIMILAR_SCORE;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.MIN_TOKEN_SCORE;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.NAME_FIELD;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.REDIRECT_FIELD;
@@ -108,6 +110,7 @@ import org.slf4j.LoggerFactory;
     @Property(name=MIN_SEARCH_TOKEN_LENGTH, intValue=DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
     @Property(name=MIN_TOKEN_SCORE,floatValue=DEFAULT_MIN_TOKEN_SCORE),
     @Property(name=SUGGESTIONS, intValue=DEFAULT_SUGGESTIONS),
+    @Property(name=INCLUDE_SIMILAR_SCORE, boolValue=DEFAULT_INCLUDE_SIMILAR_SCORE),
     @Property(name=PROCESS_ONLY_PROPER_NOUNS_STATE, boolValue=DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
     @Property(name=PROCESSED_LANGUAGES,
         cardinality=Integer.MAX_VALUE,

Modified: stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Feb 24 05:12:03 2014
@@ -62,6 +62,11 @@ enhancer.engines.linking.suggestions.nam
 enhancer.engines.linking.suggestions.description=The maximal \
 number of suggestions returned for a single mention. 
 
+enhancer.engines.linking.includeSimilarScore.name=Include Similar Score Suggestions
+enhancer.engines.linking.includeSimilarScore.description= If enabled all suggestions \
+with a similar score as the last one will be included in the result. Enabling this \
+will result in more entities being suggested than configured by 'Max Suggestions'.
+
 enhancer.engines.linking.minFoundTokens.name=Number of Required Tokens
 enhancer.engines.linking.minFoundTokens.description=For lookups with \
 several words (e.g. Dr Patrick Marshall) this is the minimum number of Tokens the label of an \

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java Mon Feb 24 05:12:03 2014
@@ -92,6 +92,11 @@ public class EntityLinkerConfig {
      */
     public static final String SUGGESTIONS = "enhancer.engines.linking.suggestions";
     /**
+     * If enabled Suggestions with similar scores are included. This means also that
+     * there might be more than {@link #SUGGESTIONS} results returned by the engine.
+     */
+    public static final String INCLUDE_SIMILAR_SCORE = "enhancer.engines.linking.includeSimilarScore";
+    /**
      * If enabled {@link MorphoFeatures#getLemma()} values are used instead of the {@link Token#getSpan()} to
      * search/match Entities within the Vocabulary linked against.
      * @see EntityLinkerConfig#isLemmaMatching()
@@ -218,6 +223,10 @@ public class EntityLinkerConfig {
      */
     public static final int DEFAULT_SUGGESTIONS = 3;
     /**
+     * By default {@link #INCLUDE_SIMILAR_SCORE} is deactivated
+     */
+    public static final boolean DEFAULT_INCLUDE_SIMILAR_SCORE = false;
+    /**
      * Default value for the number of tokens that must be contained in
      * suggested terms. The default is <code>1</code>
      */
@@ -360,6 +369,8 @@ public class EntityLinkerConfig {
      * The the maximum number of terms suggested for a word
      */
     private int maxSuggestions = DEFAULT_SUGGESTIONS;
+    
+    private boolean includeSuggestionsWithSimilarScore = DEFAULT_INCLUDE_SIMILAR_SCORE;
     /**
      * The minimum number of Tokens in the text that must match with 
      * a label of the Entity so that also non-exact matches are
@@ -585,6 +596,13 @@ public class EntityLinkerConfig {
             }
             linkerConfig.setMaxSuggestions(maxSuggestions);
         }
+        //init INCLUDE_SIMILAR_SCORE
+        value = configuration.get(INCLUDE_SIMILAR_SCORE);
+        if(value instanceof Boolean){
+            linkerConfig.setIncludeSuggestionsWithSimilarScore((Boolean)value);
+        } else if(value != null){
+            linkerConfig.setIncludeSuggestionsWithSimilarScore(Boolean.parseBoolean(value.toString()));
+        }
         
         //init MIN_FOUND_TOKENS
         value = configuration.get(MIN_FOUND_TOKENS);
@@ -1047,6 +1065,18 @@ public class EntityLinkerConfig {
     public int getMaxSuggestions() {
         return maxSuggestions;
     }
+    
+    public boolean isIncludeSuggestionsWithSimilarScore(){
+        return includeSuggestionsWithSimilarScore;
+    }
+    public void setIncludeSuggestionsWithSimilarScore(Boolean state){
+        if(state == null){
+            includeSuggestionsWithSimilarScore = DEFAULT_INCLUDE_SIMILAR_SCORE;
+        } else {
+            includeSuggestionsWithSimilarScore = state;
+        }
+    }
+    
     /**
      * Setter for the minimum number of Tokens (of the content) that MUST match
      * with a {@link EntitySearcher#getNameField() label} of a 

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java Mon Feb 24 05:12:03 2014
@@ -54,6 +54,13 @@ public class LanguageProcessingConfig im
             EnumSet.of(LexicalCategory.Noun, LexicalCategory.Quantifier,LexicalCategory.Residual);
     
     /**
+     * The default set of {@link Pos} used to match (and search) for Entities <p>
+     * Matched Tokens are not used for linking, but are considered when matching
+     * label tokens of Entities with the Text.
+     */
+    public static final Set<Pos> DEFAULT_MATCHED_POS = EnumSet.of(Pos.Gerund);
+    
+    /**
      * The default set of {@link Pos} types that are used to lookup (link) Entities.
      * By defualt only {@link Pos#ProperNoun}s and two 
      * {@link LexicalCategory#Residual} acronyms and
@@ -139,6 +146,10 @@ public class LanguageProcessingConfig im
 
     private Set<LexicalCategory> matchedLexicalCategories = DEFAULT_MATCHED_LEXICAL_CATEGORIES;
 
+    private Set<Pos> matchedPos = DEFAULT_MATCHED_POS;
+    
+    private Set<String> matchedPosTags = Collections.emptySet();
+    
     /**
      * The linked {@link Pos} categories
      */
@@ -245,6 +256,42 @@ public class LanguageProcessingConfig im
         }
     }
     /**
+     * Getter for the set of {@link Pos} tags used to match label tokens of
+     * suggested Entities
+     * @return the set of {@link Pos} tags used for matching
+     */
+    public Set<Pos> getMatchedPos(){
+        return matchedPos;
+    }
+    /**
+     * Setter for the matched {@link Pos} tags
+     * @param pos the set or <code>null</code>
+     * to set the {@link #DEFAULT_MATCHED_POS}
+     */
+    public void setMatchedPos(Set<Pos> pos) {
+        if(pos == null){
+            this.matchedPos = DEFAULT_MATCHED_POS;
+        } else {
+            this.matchedPos = EnumSet.noneOf(Pos.class);
+            this.matchedPos.addAll(pos);
+        }
+    }
+    public Set<String> getMatchedPosTags(){
+        return matchedPosTags;
+    }
+    
+    public void setMatchedPosTags(Set<String> matchedPosTags){
+        if(matchedPosTags == null){
+            this.matchedPosTags = Collections.emptySet();
+        } else if(matchedPosTags.contains(null)){
+            throw new IllegalArgumentException("The parsed set with matched POS tags MUST NOT contain the NULL element!");
+        } else {
+            this.matchedPosTags = matchedPosTags;
+        }
+
+    }
+    
+    /**
      * The set of tags used for linking. This is useful if the string tags
      * used by the POS tagger are not mapped to {@link LexicalCategory} nor
      * {@link Pos} enum members. 

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java Mon Feb 24 05:12:03 2014
@@ -54,6 +54,10 @@ public class ChunkData {
     /** the end token index relative to the current section (sentence) */
     int endToken;
     /**
+     * If this chunk has a linkable token
+     */
+    boolean hasLinkable = false;
+    /**
      * The number of matchable Tokens enclosed by this Chunk
      */
     int matchableCount;
@@ -129,6 +133,13 @@ public class ChunkData {
     public boolean isNamedEntity() {
     	return isNamedEntity;
     }
+    /**
+     * If this chunk covers a linkable token
+     * @return <code>true</code> if this chunk covers a linkable token
+     */
+    public boolean hasLinkable(){
+        return hasLinkable;
+    }
     
     /**
      * Getter for the number of matchable tokens contained in this chunk

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Mon Feb 24 05:12:03 2014
@@ -315,15 +315,28 @@ public class EntityLinker {
                     log.warn(" currnet ranking : {}",suggestions);
                     log.warn("  ... this will result in worng confidence values relative to the best match");
                 }
+                int maxSuggestions = linkerConfig.getMaxSuggestions();
+                if(suggestions.size() > maxSuggestions && 
+                        linkerConfig.isIncludeSuggestionsWithSimilarScore()){
+                    //include suggestions with similar score
+                    double minIncludeScore = suggestions.get(maxSuggestions).getScore();
+                    int numInclude = maxSuggestions + 1; //the next element
+                    double actScore;
+                    do {
+                        actScore = suggestions.get(numInclude).getScore();
+                        numInclude++; //increase for the next iteration
+                    } while(numInclude < suggestions.size() && actScore >= minIncludeScore);
+                    maxSuggestions = numInclude - 1;
+                }
+                //remove all suggestions > maxSuggestions
+                if(suggestions.size() > maxSuggestions){
+                    suggestions.subList(maxSuggestions,suggestions.size()).clear();
+                }
                 //adapt equals rankings based on the entity rank
                 if(linkerConfig.isRankEqualScoresBasedOnEntityRankings()){
                     adaptScoresForEntityRankings(suggestions);
                     adaptScoresForEntityRankings(partialMatches);
                 }
-                //remove all suggestions > config.maxSuggestions
-                if(suggestions.size() > linkerConfig.getMaxSuggestions()){
-                    suggestions.subList(linkerConfig.getMaxSuggestions(),suggestions.size()).clear();
-                }
                 if(log.isDebugEnabled()){
                     log.debug("  >> Suggestions:");
                     int i=0;

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java Mon Feb 24 05:12:03 2014
@@ -153,6 +153,7 @@ public class SectionData {
                 while(activeChunkIt.hasNext()){
                 	ChunkData activeChunk = activeChunkIt.next();
                     if (tokenData.isLinkable){
+                        activeChunk.hasLinkable = true;
                         //ignore matchableCount in Chunks with linkable Tokens
                         activeChunk.matchableCount = -10; //by setting the count to -10
                     } else if(tokenData.isMatchable){

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java Mon Feb 24 05:12:03 2014
@@ -163,8 +163,9 @@ public class TokenData {
             for(Value<PosTag> posAnnotation : posAnnotations){
                 PosTag posTag = posAnnotation.value();
                 if(posTag.isMapped()){
-                    if(!Collections.disjoint(tpc.getMatchedLexicalCategories(), 
-                        posTag.getCategories())){
+                    if((!Collections.disjoint(tpc.getMatchedLexicalCategories(), posTag.getCategories())) ||
+                            (!Collections.disjoint(tpc.getMatchedPos(), posTag.getPosHierarchy())) ||
+                            tpc.getMatchedPosTags().contains(posTag.getTag())){
                         if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
                                 posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
                             //override selectedPosTag if present

Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java Mon Feb 24 05:12:03 2014
@@ -24,6 +24,7 @@ import java.lang.ref.WeakReference;
 import java.security.AccessController;
 import java.security.PrivilegedActionException;
 import java.security.PrivilegedExceptionAction;
+import java.text.SimpleDateFormat;
 import java.util.Date;
 
 import org.apache.commons.io.FileUtils;
@@ -176,6 +177,7 @@ public class CorpusInfo {
         if(corpus != null){
             //on first usage replace a WeakReference with a SoftReference
             if(taggerCorpusRef instanceof WeakReference<?>){
+                log.debug(" ... convert Weak to Soft Reference for Corpus {}", fst);
                 taggerCorpusRef.clear();
                 taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
             }
@@ -183,6 +185,7 @@ public class CorpusInfo {
             taggerCorpusRef = null; //reset to null as the reference was taken
         }
         if(corpus == null) {
+            log.info(" ... load FST corpus {}",fst);
             try { //STANBOL-1177: load FST models in AccessController.doPrivileged(..)
                 corpus = AccessController.doPrivileged(new PrivilegedExceptionAction<TaggerFstCorpus>() {
                     public TaggerFstCorpus run() throws IOException {
@@ -194,9 +197,17 @@ public class CorpusInfo {
                                 //I need to set fstDate here, because I can not
                                 //access lastModified() outside doPrivileged
                                 fstDate = new Date(fst.lastModified());
+                                if(log.isInfoEnabled()){
+                                    log.info(" ... loaded FST (date: {})", 
+                                        SimpleDateFormat.getDateTimeInstance().format(fstDate));
+                                }
+                            } else {
+                                log.warn(" ... no corpus loaded from {}",fst);
                             }
                             return corpus;
                         } else {
+                            log.warn(" ... unable to load FST from {} (exists: {}, fileError {})",
+                                new Object[]{fst, fst.exists(),fstFileError});
                             return null;
                         }
                     }

Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Mon Feb 24 05:12:03 2014
@@ -294,12 +294,25 @@ public class FstLinkingEngine implements
             } else if(suggestions.size() > 1){ //if we have multiple suggestions
                 //sort based on score
                 Collections.sort(suggestions, Match.SCORE_COMPARATOR);
+                int maxSuggestions = elConfig.getMaxSuggestions();
+                if(suggestions.size() > maxSuggestions && 
+                        elConfig.isIncludeSuggestionsWithSimilarScore()){
+                    //include suggestions with similar score
+                    double minIncludeScore = suggestions.get(maxSuggestions).getScore();
+                    int numInclude = maxSuggestions + 1; //the next element
+                    double actScore;
+                    do {
+                        actScore = suggestions.get(numInclude).getScore();
+                        numInclude++; //increase for the next iteration
+                    } while(numInclude < suggestions.size() && actScore >= minIncludeScore);
+                    maxSuggestions = numInclude - 1;
+                }
+                //remove all suggestions > maxSuggestions
+                if(suggestions.size() > maxSuggestions){
+                    suggestions.subList(maxSuggestions,suggestions.size()).clear();
+                }
                 //adapt score based on entity ranking
                 adaptScoresForEntityRankings(suggestions);
-                //cut the list on the maximum nuber of suggestions
-                if(suggestions.size() > elConfig.getMaxSuggestions()){
-                    suggestions = suggestions.subList(0, elConfig.getMaxSuggestions());
-                }
             }
             if(log.isTraceEnabled()){ //log the suggestion information
                 log.trace("Suggestions:");

Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Mon Feb 24 05:12:03 2014
@@ -18,9 +18,11 @@ package org.apache.stanbol.enhancer.engi
 
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.CASE_SENSITIVE;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_INCLUDE_SIMILAR_SCORE;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_MATCHING_LANGUAGE;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEFAULT_SUGGESTIONS;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.ENTITY_TYPES;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.INCLUDE_SIMILAR_SCORE;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.SUGGESTIONS;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.TYPE_MAPPINGS;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE;
@@ -154,6 +156,7 @@ import com.google.common.util.concurrent
     @Property(name=FstLinkingEngineComponent.ENTITY_CACHE_SIZE, 
         intValue=FstLinkingEngineComponent.DEFAULT_ENTITY_CACHE_SIZE),
     @Property(name=SUGGESTIONS, intValue=DEFAULT_SUGGESTIONS),
+    @Property(name=INCLUDE_SIMILAR_SCORE, boolValue=DEFAULT_INCLUDE_SIMILAR_SCORE),
     @Property(name=CASE_SENSITIVE,boolValue=DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
     @Property(name=PROCESS_ONLY_PROPER_NOUNS_STATE, boolValue=DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE),
     @Property(name=PROCESSED_LANGUAGES, cardinality=Integer.MAX_VALUE,

Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/LinkableTokenFilter.java Mon Feb 24 05:12:03 2014
@@ -228,13 +228,34 @@ public final class LinkableTokenFilter e
             	}
                 first = false;
                 if(token.isLinkable){
+                    log.trace("  + lookup because {} is linkable", token);
                     lookup = true;
                 } else if (token.isMatchable){
                     lastMatchable = token.index;
                     lastIndex = lastMatchable;
-                } //else if(token.hasAlphaNumeric){
-                //    lastIndex = token.index;
-                //}
+                }
+                //special rules for processable chunks (typically noun phrases)
+                //accept all tokens in processable chunks with a linkable or
+                //multiple matchable tokens.
+                if(!lookup && (!lpc.isIgnoreChunks()) && token.inChunk != null 
+                        && token.inChunk.isProcessable){
+                    if(token.inChunk.isNamedEntity()){
+                        if(log.isTraceEnabled()){
+                            log.trace("  + lookup because {} is part of Named Entity '{}'",
+                               token.token, token.inChunk.chunk.getSpan());
+                        }
+                        lookup = true;
+                    }
+                    if(token.inChunk.hasLinkable() || 
+                            (lpc.isLinkMultiMatchableTokensInChunk() && 
+                                    token.inChunk.getMatchableCount() > 1)){
+                        if(log.isTraceEnabled()){
+                            log.trace("  + lookup because {} is part of a linkable chunk '{}'", 
+                                token.token, token.inChunk.chunk.getSpan());
+                        }
+                        lookup = true;
+                    }
+                }
             }
             //lookahead
             if(!lookup && lastIndex >= 0 && sectionData != null){

Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/Match.java Mon Feb 24 05:12:03 2014
@@ -126,6 +126,10 @@ public class Match {
     public void updateScore(double score) {
         this.score = score;
     }
+    /**
+     * Getter for the score of this match.
+     * @return the score
+     */
     public double getScore() {
         return score;
     }

Modified: stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Feb 24 05:12:03 2014
@@ -102,6 +102,11 @@ The EntityCache is a LRU cache for such 
 enhancer.engines.linking.suggestions.name=Max Suggestions
 enhancer.engines.linking.suggestions.description=The maximum number of suggestions
 
+enhancer.engines.linking.includeSimilarScore.name=Include Similar Score Suggestions
+enhancer.engines.linking.includeSimilarScore.description= If enabled all suggestions \
+with a similar score as the last one will be included in the result. Enabling this \
+will result in more entities being suggested than configured by 'Max Suggestions'
+
 enhancer.engines.linking.minSearchTokenLength.name=Min Token Length
 enhancer.engines.linking.minSearchTokenLength.description=The minimum \
 length of Tokens used to lookup Entities within the Controlled Vocabulary. This parameter is ignored \

Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseBuilder.java Mon Feb 24 05:12:03 2014
@@ -24,6 +24,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 
+import org.apache.stanbol.enhancer.engines.poschunker.PhraseTypeDefinition.TokenTypeDefinition;
 import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
 import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Section;
@@ -105,7 +106,7 @@ public class PhraseBuilder {
     
     public void nextSection(Section section){
         buildPhrase(null);
-    	log.trace("-- next {} --", section);
+    	log.debug("-- next {} --", section);
     }
     
 
@@ -115,12 +116,14 @@ public class PhraseBuilder {
             phraseType.getRequiredType());
         if(states[0]){
             current.add(token);
-            if(log.isTraceEnabled()) {
-	        	log.trace("-- {} phrase start --", phraseType.getPhraseType().name());
-	        	log.trace(" {}. {} {}", new Object[]{ current.size(), token, 
+            if(log.isDebugEnabled()) {
+	        	log.debug("-- {} phrase start --", phraseType.getPhraseType().name());
+	        	log.debug(" {}. {} {}", new Object[]{ current.size(), token, 
 	        			logPosCategories(token)});
             }
             valid = states[1];
+        } else if(log.isTraceEnabled()){
+            log.trace("  - {} {}", token, logPosCategories(token));
         }
     }
 
@@ -135,8 +138,8 @@ public class PhraseBuilder {
         }
         if(states[0]){
             current.add(token);
-            if(log.isTraceEnabled()) {
-	        	log.trace(" {}. {} {}", new Object[]{ current.size(), token, 
+            if(log.isDebugEnabled()) {
+	        	log.debug(" {}. {} {}", new Object[]{ current.size(), token, 
 	        			logPosCategories(token)});
             }
         }
@@ -163,17 +166,17 @@ public class PhraseBuilder {
                 Chunk chunk = chunkFactory.createChunk(current.get(0), lastConsumedToken);
                 //TODO: add support for confidence
                 chunk.addAnnotation(PHRASE_ANNOTATION, Value.value(phraseTag));
-                if(log.isTraceEnabled()){
-                	log.trace("  << add {} phrase {} '{}'", new Object[]{
+                if(log.isDebugEnabled()){
+                	log.debug("  << add {} phrase {} '{}'", new Object[]{
                 			phraseType.getPhraseType().name(), chunk,chunk.getSpan()});
                 }
-            } else if(log.isTraceEnabled()){
-            	log.trace("  >> ignore {} phrase with single {} ", 
+            } else if(log.isDebugEnabled()){
+            	log.debug("  >> ignore {} phrase with single {} ", 
             			phraseType.getPhraseType().name() ,
             			current.get(0));
             }
-        } else if(!current.isEmpty() && log.isTraceEnabled()){
-        	log.trace("  << ignore invalid {} phrase [{},{}]",  new Object[]{ 
+        } else if(!current.isEmpty() && log.isDebugEnabled()){
+        	log.debug("  << ignore invalid {} phrase [{},{}]",  new Object[]{ 
         			phraseType.getPhraseType().name(), current.get(0).getStart(), 
         			current.get(current.size()-1).getEnd()});
         }
@@ -193,12 +196,12 @@ public class PhraseBuilder {
      * is suitable for {@link PhraseTypeDefinition#getStartType()} and
      * {@link PhraseTypeDefinition#getRequiredType()}.
      * @param token the Token
-     * @param categories the list of categories to check
+     * @param ttd the {@link TokenTypeDefinition}s to check
      * @return if the sum of matching annotations compared to the score of all
      * POS annotations is higher or equals the configured {@link #minPosSocre}.
      * For each parsed categories set a boolean state is returned.
      */
-    private boolean[] checkCategories(Token token, Set<LexicalCategory>...categories) {
+    private boolean[] checkCategories(Token token, TokenTypeDefinition...ttd) {
         //there are different ways NLP frameworks do assign scores. For some the
         //sum of all categories would sum up to 1.0, but as only the top three
         //categories are included the sum would be < 1
@@ -210,22 +213,28 @@ public class PhraseBuilder {
         //Match.max(1.0,sumScore).
         //POS tags without score are assigned a #DEFAULT_SCORE. If not a single
         //POS tag with a score is present the sumScore is NOT normalized to 1.0
+        log.trace("> check Categories for {}",token);
+        if(log.isTraceEnabled()){
+            for(int i = 0; i < ttd.length; i++){
+                log.trace( "Cat {}: {}",i,ttd[i]);
+            }
+        }
         boolean scorePresent = false;
         double sumScore = 0;
-        double[] matchScores = new double[categories.length];
+        double[] matchScores = new double[ttd.length];
         for(Value<PosTag> pos : token.getAnnotations(POS_ANNOTATION)){
+            log.trace(" - {}",pos);
             double score = pos.probability();
             if(score == Value.UNKNOWN_PROBABILITY){
                 score = DEFAULT_SCORE;
             } else {
                 scorePresent = true;
             }
-            sumScore = sumScore + pos.probability();
-            Set<LexicalCategory> tokenCategories = pos.value().getCategories();
-            for(int i = 0; i < categories.length; i++){
-                Set<LexicalCategory> category = categories[i];
-                if(!Collections.disjoint(tokenCategories, category)){
-                    matchScores[i] = matchScores[i] + pos.probability();
+            sumScore = sumScore + score;
+            for(int i = 0; i < ttd.length; i++){
+                if(ttd[i].matches(pos.value())){
+                    log.trace("  matches Category {} with score {}",i,score);
+                    matchScores[i] = matchScores[i] + score;
                 }
             }
         }

Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/PhraseTypeDefinition.java Mon Feb 24 05:12:03 2014
@@ -18,10 +18,13 @@ package org.apache.stanbol.enhancer.engi
 
 import java.util.Collections;
 import java.util.EnumSet;
+import java.util.HashSet;
 import java.util.Set;
 
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
 import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
 
 /**
  * Definition of a phrase type<p>
@@ -47,72 +50,24 @@ public class PhraseTypeDefinition {
 
     protected final LexicalCategory phraseType;
     
-    private final Set<LexicalCategory> startTypes;
-    protected final Set<LexicalCategory> readOnlyStartTypes;
-    private final Set<LexicalCategory> prefixTypes;
-    protected final Set<LexicalCategory> readOnlyPrefixTypes;
-    private final Set<LexicalCategory> continuationTypes;
-    protected final Set<LexicalCategory> readOnlyContinuationTypes;
-    private final Set<LexicalCategory> requiredTypes;
-    protected final Set<LexicalCategory> readOnlyRequiredTypes;
-    private final Set<LexicalCategory> endTypes;
-    protected final Set<LexicalCategory> readOnlyEndTypes;
+    private final TokenTypeDefinition startTypeDefinition;
+    private final TokenTypeDefinition prefixTypeDefinition;
+    private final TokenTypeDefinition continuationTypeDefinition;
+    private final TokenTypeDefinition requiredTypeDefinition;
+    private final TokenTypeDefinition endTypeDefinition;
     
     public PhraseTypeDefinition(LexicalCategory phraseType) {
         if(phraseType == null){
             throw new IllegalArgumentException("The parsed PhraseType MUST NOT be NULL!");
         }
         this.phraseType = phraseType;
-        startTypes = EnumSet.of(phraseType);
-        readOnlyStartTypes = Collections.unmodifiableSet(startTypes);
-        prefixTypes = EnumSet.of(phraseType);
-        readOnlyPrefixTypes = Collections.unmodifiableSet(prefixTypes);
-        continuationTypes = EnumSet.of(phraseType);
-        readOnlyContinuationTypes = Collections.unmodifiableSet(continuationTypes);
-        requiredTypes = EnumSet.of(phraseType);
-        readOnlyRequiredTypes = Collections.unmodifiableSet(requiredTypes);
-        endTypes = EnumSet.of(phraseType);
-        readOnlyEndTypes = Collections.unmodifiableSet(startTypes);
+        startTypeDefinition = new TokenTypeDefinition(phraseType);
+        prefixTypeDefinition = new TokenTypeDefinition(phraseType);
+        continuationTypeDefinition = new TokenTypeDefinition(phraseType);
+        requiredTypeDefinition = new TokenTypeDefinition(phraseType);
+        endTypeDefinition = new TokenTypeDefinition(phraseType);
     }
     
-    public boolean addStartType(LexicalCategory...types){
-        return add(startTypes,types);
-    }
-
-    public boolean addPrefixType(LexicalCategory...types){
-        return add(prefixTypes,types);
-    }
-    
-    public boolean addContinuationType(LexicalCategory...types){
-        return add(continuationTypes,types);
-    }
-    
-    public boolean addRequiredType(LexicalCategory...types){
-        return add(requiredTypes,types);
-    }
-    public boolean addEndType(LexicalCategory...types){
-        return add(endTypes,types);
-    }
-    
-    public boolean removeStartType(LexicalCategory...types){
-        return remove(startTypes,types);
-    }
-    
-    public boolean removePrefixType(LexicalCategory...types){
-        return remove(prefixTypes,types);
-    }
-    
-    public boolean removeContinuationType(LexicalCategory...types){
-        return remove(continuationTypes,types);
-    }
-    
-    public boolean removeRequiredType(LexicalCategory...types){
-        return remove(requiredTypes,types);
-    }
-
-    public boolean removeEndType(LexicalCategory...types){
-        return remove(endTypes,types);
-    }
     /**
      * Getter for the type of this phrase definition
      * @return
@@ -126,8 +81,8 @@ public class PhraseTypeDefinition {
      * @return the read only set with {@link LexicalCategory LexicalCategories}
      * that can start a phrase of that type
      */
-    public Set<LexicalCategory> getStartType(){
-        return readOnlyStartTypes;
+    public TokenTypeDefinition getStartType(){
+        return startTypeDefinition;
     }
     /**
      * Getter for the read only set with the prefix types
@@ -138,8 +93,8 @@ public class PhraseTypeDefinition {
      * considered in prefixes (e.g. "A nice weekend") but excluded after the
      * first noun (e.g. "the trip last week"). 
      */
-    public Set<LexicalCategory> getPrefixType(){
-        return readOnlyPrefixTypes;
+    public TokenTypeDefinition getPrefixType(){
+        return prefixTypeDefinition;
     }
     
     /**
@@ -151,8 +106,8 @@ public class PhraseTypeDefinition {
      * considered in prefixes (e.g. "A nice weekend") but excluded after the
      * first noun (e.g. "the trip last week"). 
      */
-    public Set<LexicalCategory> getContinuationType(){
-        return readOnlyContinuationTypes;
+    public TokenTypeDefinition getContinuationType(){
+        return continuationTypeDefinition;
     }
     
     /**
@@ -160,8 +115,8 @@ public class PhraseTypeDefinition {
      * @return the read only set with {@link LexicalCategory LexicalCategories}
      * that MUST occur within a phrase of that type
      */
-    public Set<LexicalCategory> getRequiredType(){
-        return readOnlyRequiredTypes;
+    public TokenTypeDefinition getRequiredType(){
+        return requiredTypeDefinition;
     }
     
     /**
@@ -169,40 +124,263 @@ public class PhraseTypeDefinition {
      * @return the read only set with {@link LexicalCategory LexicalCategories}
      * that can end a phrase of that type
      */
-    public Set<LexicalCategory> getEndType(){
-        return readOnlyEndTypes;
+    public TokenTypeDefinition getEndType(){
+        return endTypeDefinition;
     }
-
-    private boolean add(Set<LexicalCategory> set, LexicalCategory...types){
-        boolean changed = false;
-        if(types != null){
-            for(LexicalCategory type : types){
-                if(type != null){
-                    if(set.add(type)){
-                        changed = true;
+    
+    @Override
+    public String toString() {
+    	return phraseType.name();
+    }
+    
+    public static class TokenTypeDefinition {
+        
+        private final Set<LexicalCategory> categories = EnumSet.noneOf(LexicalCategory.class);
+        private Set<Pos> posTags = EnumSet.noneOf(Pos.class);
+        private Set<Pos> excludedPosTags = EnumSet.noneOf(Pos.class);
+        private Set<String> tags = new HashSet<String>();
+        
+        /**
+         * Used by the constructor of the {@link PhraseTypeDefinition} class
+         * @param lc
+         */
+        private TokenTypeDefinition(LexicalCategory lc){
+            this(Collections.singleton(lc),null);
+        }
+        
+        public TokenTypeDefinition(Set<LexicalCategory> categories, Set<Pos> posTags, String...tags) {
+            if(categories != null){
+                for(LexicalCategory lc : categories){
+                    if(lc != null){
+                        this.categories.add(lc);
+                    }
+                }
+            }
+            if(posTags != null){
+                for(Pos pos : posTags){
+                    if(pos != null){
+                        this.posTags.add(pos);
+                    }
+                }
+            }
+            if(tags != null){
+                for(String tag : tags){
+                    if(tag != null){
+                        this.tags.add(tag);
                     }
                 }
             }
         }
-        return changed;
-    }
-    
-    private boolean remove(Set<LexicalCategory> set, LexicalCategory...types){
-        boolean changed = false;
-        if(types != null){
-            for(LexicalCategory type : types){
-                if(type != null){
-                    if(set.remove(type)){
-                        changed = true;
+        /**
+         * Read-/writeable set of {@link LexicalCategory LexicalCategories}
+         * @return the set of lexical categories
+         */
+        public Set<LexicalCategory> getCategories() {
+            return categories;
+        }
+        /**
+         * Adds the parsed {@link LexicalCategory LexicalCategories}
+         * @param categories the LexicalCategories
+         * @return if the {@link TokenTypeDefinition} was updated by this operation
+         */
+        public boolean addCategories(LexicalCategory...categories){
+            return add(this.categories, categories);
+        }
+        
+        /**
+         * Removes the parsed {@link LexicalCategory LexicalCategories}
+         * @param categories the LexicalCategories
+         * @return if the {@link TokenTypeDefinition} was updated by this operation
+         */
+        public boolean removeCategories(LexicalCategory...categories){
+            return remove(this.categories, categories);
+        }
+        
+        /**
+         * Read-/writeable set of {@link Pos} tags
+         * @return the set of POS tags
+         */
+        public Set<Pos> getPosTags() {
+            return posTags;
+        }
+        
+        /**
+         * Adds the parsed {@link Pos} tags
+         * @param pos the {@link Pos} tags
+         * @return if the {@link TokenTypeDefinition} was updated by this operation
+         */
+        public boolean addPosTags(Pos...pos){
+            return add(this.posTags, pos);
+        }
+        
+        /**
+         * Removes the parsed {@link Pos} tags
+         * @param pos the {@link Pos} tags
+         * @return if the {@link TokenTypeDefinition} was updated by this operation
+         */
+        public boolean removePosTags(Pos...pos){
+            return remove(this.posTags, pos);
+        }
+        
+        /**
+         * Read-/writeable set of excluded {@link Pos} tags. This allows to
+         * include a {@link LexicalCategory} but to exclude some specific 
+         * {@link Pos} member of this category.
+         * @return the set of excluded POS tags
+         */
+        public Set<Pos> getExcludedPosTags() {
+            return excludedPosTags;
+        }
+        
+        /**
+         * Adds the parsed {@link Pos} tags to the set of excluded {@link Pos} tags
+         * @param pos the {@link Pos} tags
+         * @return if the {@link TokenTypeDefinition} was updated by this operation
+         */
+        public boolean addExcludedPosTags(Pos...pos){
+            return add(this.excludedPosTags, pos);
+        }
+        
+        /**
+         * Removes the parsed {@link Pos} tags from the set of excluded {@link Pos} tags
+         * @param pos the {@link Pos} tags
+         * @return if the {@link TokenTypeDefinition} was updated by this operation
+         */
+        public boolean removeExcludedPosTags(Pos...pos){
+            return remove(this.excludedPosTags, pos);
+        }
+        /**
+         * Read-/writeable set of string tags (as provided by the POS tagger)
+         * @return the set of String tags
+         */
+        public Set<String> getTags() {
+            return tags;
+        }
+        /**
+         * Adds the parsed tags
+         * @param tag the tags
+         * @return if the {@link TokenTypeDefinition} was updated by this operation
+         */
+        public boolean addTags(String...tag){
+            return add(this.tags, tag);
+        }
+        
+        /**
+         * Removes the parsed tags
+         * @param tag the tags
+         * @return if the {@link TokenTypeDefinition} was updated by this operation
+         */
+        public boolean removeTags(String...tag){
+            return remove(this.tags, tag);
+        }
+        
+        /**
+         * Checks if a posTag matches against this TokenTypeDefinition
+         * @param posTag the posTag to check
+         * @return <code>true</code> in case of a match. Otherwise <code>false</code>
+         * @throws NullPointerException if the parsed posTag is <code>null</code>
+         */
+        public boolean matches(PosTag posTag){
+            //check against included categories, posTags and tags
+            boolean matches = 
+                    (!Collections.disjoint(posTag.getCategories(), categories)) ||
+                    (!Collections.disjoint(posTag.getPosHierarchy(), posTags)) ||
+                    tags.contains(posTag.getTag());
+            //if there is a match we need still to check for excluded POS tags
+            return matches ? Collections.disjoint(posTag.getPosHierarchy(),excludedPosTags) :
+                false;
+        }
+        
+        private <T> boolean add(Set<T> set, T...types){
+            boolean changed = false;
+            if(types != null){
+                for(T type : types){
+                    if(type != null){
+                        if(set.add(type)){
+                            changed = true;
+                        }
                     }
                 }
             }
+            return changed;
+        }
+        
+        private <T> boolean remove(Set<T> set, T...types){
+            boolean changed = false;
+            if(types != null){
+                for(T type : types){
+                    if(type != null){
+                        if(set.remove(type)){
+                            changed = true;
+                        }
+                    }
+                }
+            }
+            return changed;
+        }
+        
+        @Override
+        public String toString() {
+            StringBuilder sb = new StringBuilder();
+            if(!categories.isEmpty()){
+                sb.append("Cat: ");
+                boolean first = true;
+                for(LexicalCategory lc : categories){
+                    if(first){
+                        first = false;
+                    } else {
+                        sb.append(", ");
+                    }
+                    sb.append(lc.name());
+                }
+            }
+            if(!posTags.isEmpty()){
+                if(sb.length() > 0){
+                    sb.append(" | ");
+                }
+                sb.append("Pos: ");
+                boolean first = true;
+                for(Pos pos : posTags){
+                    if(first){
+                        first = false;
+                    } else {
+                        sb.append(", ");
+                    }
+                    sb.append(pos.name());
+                }
+            }
+            if(!tags.isEmpty()){
+                if(sb.length() > 0){
+                    sb.append(" | ");
+                }
+                sb.append("Tags: ");
+                boolean first = true;
+                for(String tag : tags){
+                    if(first){
+                        first = false;
+                    } else {
+                        sb.append(", ");
+                    }
+                    sb.append(tag);
+                }
+            }
+            if(!excludedPosTags.isEmpty()){
+                if(sb.length() > 0){
+                    sb.append(" | ");
+                }
+                sb.append("Excluded: ");
+                boolean first = true;
+                for(Pos pos : excludedPosTags){
+                    if(first){
+                        first = false;
+                    } else {
+                        sb.append(", ");
+                    }
+                    sb.append(pos.name());
+                }
+            }
+            return sb.toString();
         }
-        return changed;
     }
     
-    @Override
-    public String toString() {
-    	return phraseType.name();
-    }
 }

Modified: stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java?rev=1571145&r1=1571144&r2=1571145&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/pos-chunker/src/main/java/org/apache/stanbol/enhancer/engines/poschunker/engine/PosChunkerEngine.java Mon Feb 24 05:12:03 2014
@@ -52,6 +52,7 @@ import org.apache.stanbol.enhancer.nlp.m
 import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
 import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
 import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
 import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
@@ -126,22 +127,29 @@ public class PosChunkerEngine extends Ab
     //TODO: make configurable
     static {
         PhraseTypeDefinition nounPD = new PhraseTypeDefinition(LexicalCategory.Noun);
+        //NOTE: Pos.Acronym, Pos.Abbreviation, Pos.Foreign are also considered as
+        //      nouns by this definition.
+        nounPD.getRequiredType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
         //start types noun (automatically included) pronoun or determiners, adjectives 
-        nounPD.addStartType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        nounPD.getStartType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        nounPD.getStartType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
         //prefix types are the same as start types (e.g. "the nice trip")
-        nounPD.addPrefixType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        nounPD.getPrefixType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        nounPD.getPrefixType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
         //continuation types are nouns and punctations. 
         //NOTE: Adverbs are excluded to avoid phrases like "the nice trip last week"
-        nounPD.addContinuationType(LexicalCategory.Punctuation);
+        nounPD.getContinuationType().addCategories(LexicalCategory.Punctuation);
+        nounPD.getContinuationType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
         //end types are the same as start terms
-        nounPD.addEndType(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        nounPD.getEndType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
+        nounPD.getEndType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
         //and required types do include a Noun (what is actually included by default)
         NOUN_PHRASE_TYPE = nounPD;
 
         PhraseTypeDefinition verbPD = new PhraseTypeDefinition(LexicalCategory.Verb);
-        verbPD.addStartType(LexicalCategory.Adverb);
-        verbPD.addContinuationType(LexicalCategory.Adverb,LexicalCategory.Punctuation);
-        verbPD.addEndType(LexicalCategory.Adverb);
+        verbPD.getStartType().addCategories(LexicalCategory.Adverb);
+        verbPD.getContinuationType().addCategories(LexicalCategory.Adverb,LexicalCategory.Punctuation);
+        verbPD.getEndType().addCategories(LexicalCategory.Adverb);
         //and required types do include a Verbs (what is actually included by default)
         VERB_PHRASE_TYPE = verbPD;
     }