You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/08/23 11:22:54 UTC

svn commit: r1516775 - in /stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl: ChunkData.java EntityLinker.java ProcessingState.java SectionData.java TokenData.java

Author: rwesten
Date: Fri Aug 23 09:22:53 2013
New Revision: 1516775

URL: http://svn.apache.org/r1516775
Log:
STANBOL-1128: Refactorings to the internal API of the EntityLinkingEngine required for reusage in the FST Linking Engine

Added:
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java
Modified:
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java

Added: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java?rev=1516775&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ChunkData.java Fri Aug 23 09:22:53 2013
@@ -0,0 +1,108 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+
+/** 
+ * Represents a Chunk (group of tokens) used as context for EntityLinking.
+ * Typically a single {@link ChunkData#chunk} is used, but in case of
+ * overlapping and {@link ChunkData#isProcessable processable} chunks
+ * multiple {@link Chunk}s might be merged into a single {@link ChunkData}
+ * instance. In such cases {@link ChunkData#chunk} represents the
+ * first and {@link ChunkData#merged} the last of the merged chunks.<p>
+ * {@link ChunkData#startToken} and {@link ChunkData#endToken} represent
+ * the covered [start,end) {@link Token} indices relative to the current
+ * section (typically a {@link Sentence}). {@link ChunkData#getStartChar()}
+ * and {@link ChunkData#getEndChar()} are the absolute [start,end) character
+ * indices within the {@link AnalysedText#getSpan()}.
+ */
+public class ChunkData {
+    protected final static boolean DEFAULT_PROCESSABLE_STATE = true; //used if no phrase annotation allows a decision
+    /** if the Chunk is processable (decided once by the constructor) */
+    public final boolean isProcessable;
+    /** the Chunk (the first one in case several chunks are merged, see {@link #merged}) */
+    public final Chunk chunk;
+    /** 
+     * In case of multiple overlapping and processable {@link Chunk}s the
+     * sections selected by the chunks are merged. While {@link #chunk}
+     * holds the original chunk (the first) this variable holds the
+     * last merged one. Enclosed chunks (in case more than two are
+     * merged) are not available via this class, but can be retrieved
+     * by iterating over the {@link AnalysedText} content part.
+     */
+    Chunk merged;
+    /** the start token index relative to the current section (sentence) */
+    int startToken;
+    /** the end token index relative to the current section (sentence) */
+    int endToken;
+    /**
+     * The number of matchable Tokens enclosed by this Chunk
+     */
+    int matchableCount;
+    /**
+     * Constructs and initializes the meta data for the parsed {@link Chunk}.
+     * @param tpc the configuration providing the processed phrase categories/tags and the probability thresholds
+     * @param chunk the {@link Chunk} to wrap; its phrase annotations decide {@link #isProcessable} */
+    public ChunkData(LanguageProcessingConfig tpc, Chunk chunk){
+        this.chunk = chunk;
+        Boolean process = null; //null := no annotation allowed a decision (yet)
+        for (Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)) {
+            if (tpc.getProcessedPhraseCategories().contains(phraseAnnotation.value().getCategory())
+                || tpc.getProcessedPhraseTags().contains(phraseAnnotation.value().getTag())) {
+                if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+                        phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()) {
+                    process = true; //configured phrase type with unknown or sufficient probability
+                    break; //the first decisive annotation wins
+                } // else probability too low for inclusion
+            } else if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+                    phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()) {
+                process = false; //phrase type not configured and probability unknown or high enough to exclude
+                break;
+            } // else probability too low for exclusion
+        }
+        isProcessable = process == null ? DEFAULT_PROCESSABLE_STATE : process; //undecided -> use the default
+    }
+    /**
+     * Getter for the start character position
+     * @return the start character position of the selected text span.
+     */
+    public int getStartChar(){
+        return chunk.getStart();
+    }
+    /**
+     * Getter for the end character position of the text selected by
+     * possibly multiple {@link #merged} chunks.
+     * @return the end character position considering possible {@link #merged}
+     * chunks.
+     */
+    public int getEndChar(){
+        return merged == null ? chunk.getEnd() : merged.getEnd();
+    }
+    /**
+     * If this chunk is processable
+     * @return the state
+     */
+    public boolean isProcessable() {
+        return isProcessable;
+    }
+    /**
+     * Getter for the number of matchable tokens contained in this chunk
+     * @return The number of matchable tokens contained in this chunk
+     */
+    public int getMatchableCount() {
+        return matchableCount;
+    }
+    public int getStartTokenIndex() { //getter for the startToken field (index relative to the current section)
+        return startToken;
+    }
+    public int getEndTokenIndex() { //getter for the endToken field (index relative to the current section)
+        return endToken;
+    }
+}
\ No newline at end of file

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1516775&r1=1516774&r2=1516775&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Fri Aug 23 09:22:53 2013
@@ -43,7 +43,6 @@ import org.apache.stanbol.enhancer.engin
 import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
 import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.RedirectProcessingMode;
 import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
-import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState.TokenData;
 import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.Section;
@@ -351,7 +350,7 @@ public class EntityLinker {
                 LinkedEntity linkedEntity = linkedEntities.get(selectedText);
                 if(linkedEntity == null){
                     linkedEntity = new LinkedEntity(selectedText,
-                        suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
+                        suggestions, getLinkedEntityTypes(suggestions));
                     linkedEntities.put(selectedText, linkedEntity);
                 } // else Assumption: The list of suggestions is the SAME
                 linkedEntity.addOccurrence(state.getSentence(), 
@@ -374,7 +373,7 @@ public class EntityLinker {
                     linkedEntity = linkedEntities.get(selectedText);
                     if(linkedEntity == null){
                         linkedEntity = new LinkedEntity(selectedText,
-                            partialMatches, getLinkedEntityTypes(suggestions.subList(0, 1)));
+                            partialMatches, getLinkedEntityTypes(suggestions));
                         linkedEntities.put(selectedText, linkedEntity);
                     } // else Assumption: The list of suggestions is the SAME
                     linkedEntity.addOccurrence(state.getSentence(), 
@@ -486,7 +485,12 @@ public class EntityLinker {
      */
     private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){
         Collection<UriRef> conceptTypes = new HashSet<UriRef>();
+        double score = -1; //only consider types of the best ranked Entities
         for(Suggestion suggestion : suggestions){
+            double actScore = suggestion.getScore();
+            if(actScore < score){
+                break;
+            }
             for(Iterator<UriRef> types = 
                 suggestion.getEntity().getReferences(linkerConfig.getTypeField()); 
                 types.hasNext();conceptTypes.add(types.next()));

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1516775&r1=1516774&r2=1516775&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java Fri Aug 23 09:22:53 2013
@@ -19,9 +19,7 @@
  */
 package org.apache.stanbol.enhancer.engines.entitylinking.impl;
 
-import static java.util.Collections.disjoint;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.UNICASE_SCRIPT_LANUAGES;
-import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
 import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
 
 import java.util.ArrayList;
@@ -35,9 +33,8 @@ import java.util.Locale;
 import org.apache.commons.collections.Predicate;
 import org.apache.commons.collections.iterators.FilterIterator;
 import org.apache.stanbol.commons.namespaceprefix.service.StanbolNamespacePrefixService;
-import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
 import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
-import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
 import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
 import org.apache.stanbol.enhancer.nlp.model.Chunk;
 import org.apache.stanbol.enhancer.nlp.model.Section;
@@ -45,11 +42,7 @@ import org.apache.stanbol.enhancer.nlp.m
 import org.apache.stanbol.enhancer.nlp.model.Span;
 import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
 import org.apache.stanbol.enhancer.nlp.model.Token;
-import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
-import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
-import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
 import org.apache.stanbol.enhancer.nlp.pos.Pos;
-import org.apache.stanbol.enhancer.nlp.pos.PosTag;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -251,149 +244,11 @@ public class ProcessingState {
                 continue; //ignore this section
             }
             consumedSectionIndex = section.getEnd();
-            tokens.clear(); //clear token for each section (STANBOL-818)
-            Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
-            ChunkData activeChunk = null;
-            while(enclosed.hasNext()){
-                Span span = enclosed.next();
-                if(span.getStart() >= span.getEnd()){ //save guard against empty spans
-                    log.warn("Detected Empty Span {} in section {} of Blob {}",
-                        new Object[]{span,section, at.getBlob()});
-                }
-                if(span.getType() == SpanTypeEnum.Chunk){
-                    ChunkData chunkData = new ChunkData((Chunk)span);
-                    if(chunkData.isProcessable()){
-                        if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
-                            if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
-                                log.info("   - merge overlapping and processable Chunks {} <-> {}",
-                                    activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
-                                activeChunk.merged = (Chunk)span; //set this one as last merged
-                            } //ignore completely covered chunks
-                        } else { // a new Chunk starts
-                            activeChunk = chunkData;
-                            activeChunk.startToken = tokens.size();
-                            if(log.isDebugEnabled()){
-                                log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
-                                    new Object []{
-                                        activeChunk.chunk.getType(),
-                                        activeChunk.startToken,
-                                        activeChunk.chunk.getSpan()
-                                    });
-                            }
-                        } 
-                    } //else ignore chunks that are not processable
-                } else if(span.getType() == SpanTypeEnum.Token){
-                    TokenData tokenData = new TokenData(tokens.size(),(Token)span,activeChunk);
-                    if(log.isDebugEnabled()){
-                        log.debug("  > {}: {} {}(pos:{}) chunk: '{}'",
-                            new Object[]{tokenData.index,tokenData.token,
-                                tokenData.morpho != null ? ("(lemma: "+tokenData.morpho.getLemma()+") ") : "",
-                                tokenData.token.getAnnotations(POS_ANNOTATION),
-                                tokenData.inChunk != null ? tokenData.inChunk.chunk.getSpan() : "none"});
-                    }
-                    if(!tokenData.hasAlphaNumeric){
-                        tokenData.isLinkable = false;
-                        tokenData.isMatchable = false;
-                    } else {
-                        // (1) apply basic rules for linkable/processable tokens
-                        //determine if the token should be linked/matched
-                        tokenData.isLinkable = tokenData.isLinkablePos != null ? tokenData.isLinkablePos : false;
-                        //matchabel := linkable OR has matchablePos
-                        tokenData.isMatchable = tokenData.isLinkable || 
-                                (tokenData.isMatchablePos != null && tokenData.isMatchablePos);
-                        
-                        //(2) for non linkable tokens check for upper case rules
-                        if(!tokenData.isLinkable && tokenData.upperCase && 
-                                tokenData.index > 0 && //not a sentence or sub-sentence start
-                                !tokens.get(tokenData.index-1).isSubSentenceStart){
-                            //We have an upper case token!
-                            if(tpc.isLinkUpperCaseTokens()){
-                                if(tokenData.isMatchable) { //convert matchable to 
-                                    tokenData.isLinkable = true; //linkable
-                                    tokenData.isMatchable = true;
-                                } else { // and other tokens to
-                                    tokenData.isMatchable = true; //matchable
-                                }
-                            } else { 
-                                //finally we need to convert other Tokens to matchable
-                                //if MatchUpperCaseTokens is active
-                                if(!tokenData.isMatchable && tpc.isMatchUpperCaseTokens()){
-                                    tokenData.isMatchable = true;
-                                }
-                            }
-                        } //else not an upper case token
-                        
-                        //(3) Unknown POS tag Rules (see STANBOL-1049)
-                        if(!tokenData.isLinkable && (tokenData.isLinkablePos == null || 
-                                tokenData.isMatchablePos == null)){
-                            if(isUnicaseLanguage || !tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
-                                if(tokenData.isLinkablePos == null && tokenData.hasSearchableLength){
-                                    tokenData.isLinkable = true;
-                                    tokenData.isMatchable = true;
-                                } //else no need to change the state
-                            } else { //non unicase language and link only upper case tokens enabled
-                                if(tokenData.upperCase && // upper case token
-                                        tokenData.index > 0 && //not a sentence or sub-sentence start
-                                        !tokens.get(tokenData.index-1).isSubSentenceStart){
-                                    if(tokenData.hasSearchableLength && tokenData.isLinkablePos == null){
-                                        tokenData.isLinkable = true;
-                                        tokenData.isMatchable = true;
-                                    } else if(tokenData.isMatchablePos == null){
-                                        tokenData.isMatchable = true;
-                                    }
-                                } else if(tokenData.hasSearchableLength &&  //lower case and long token
-                                        tokenData.isMatchablePos == null){ 
-                                    tokenData.isMatchable = true;
-                                } //else lower case and short word 
-                            }
-                        } //else already linkable or POS tag present
-                    }
-                    log.debug("    - {}",tokenData); 
-                    //add the token to the list
-                    tokens.add(tokenData);
-                    if(!foundLinkableToken){
-                        foundLinkableToken = tokenData.isLinkable;
-                    }
-                    if(activeChunk != null){
-                        if (tokenData.isLinkable){
-                            //ignore matchableCount in Chunks with linkable Tokens
-                            activeChunk.matchableCount = -10; //by setting the count to -10
-                        } else if(tokenData.isMatchable){
-                            activeChunk.matchableCount++;
-                        }
-                        if (span.getEnd() >= activeChunk.getEndChar()){
-                            //this is the last token in the current chunk
-                            activeChunk.endToken = tokens.size()-1;
-                            log.debug("   - end Chunk@pos: {}", activeChunk.endToken);
-                            if(tpc.isLinkMultiMatchableTokensInChunk() && 
-                                    activeChunk.getMatchableCount() > 1 ){
-                                log.debug("   - multi-matchable Chunk:");
-                                //mark the last of two immediate following matchable
-                                //tokens as processable
-                                for(int i = activeChunk.endToken-1;i >= activeChunk.startToken+1;i--){
-                                    TokenData ct = tokens.get(i);
-                                    TokenData pt = tokens.get(i-1);
-                                    if(ct.isMatchable && pt.isMatchable){
-                                        if(!ct.isLinkable) { //if not already processable
-                                            log.debug("     > convert Token {}: {} (pos:{}) from matchable to processable",
-                                                new Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
-                                            ct.isLinkable = true;
-                                            if(!foundLinkableToken){
-                                                foundLinkableToken = true;
-                                            }
-                                        }
-                                        i--;//mark both (ct & pt) as processed
-                                    }
-                                }
-                            }
-                            activeChunk = null;
-                        }
-                    }
-                }
-            }
-            if(activeChunk != null) { //close the last chunk (if not done)
-                activeChunk.endToken = tokens.size()-1;
-            }
+            SectionData sectionData = new SectionData(tpc, section, enclosedSpanTypes, foundLinkableToken);
+            //TODO: It would be better to use a SectionData field instead
+            tokens = sectionData.getTokens();
+            section = sectionData.section;
+            foundLinkableToken = sectionData.hasLinkableToken();
         }
         processableTokensIterator = new FilterIterator(tokens.iterator(), PROCESSABLE_TOKEN_OREDICATE);
         return foundLinkableToken;
@@ -421,71 +276,6 @@ public class ProcessingState {
             tokens.get(start+(tokenCount-1)).token.getEnd()-offset);
     }
     
-//    /**
-
-//     */
-//    protected boolean getProcessablePosTag(Token token) {
-//        for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
-//            // check three possible match
-//            //  1. the LexicalCategory matches
-//            //  2. the Pos matches
-//            //  3. the String tag matches
-//            PosTag posTag = posAnnotation.value();
-////            log.debug("   ... check PosAnntation {} (lc:{}|pos:{}|tag:{}",
-////                new Object[]{posAnnotation,posTag.getCategories(),
-////                             posTag.getPosHierarch(),posTag.getTag()});
-//            if((!Collections.disjoint(tpc.getProcessedLexicalCategories(), 
-//                    posTag.getCategories())) ||
-//                (!Collections.disjoint(tpc.getProcessedPos(),
-//                    posTag.getPosHierarchy())) ||
-//                tpc.getProcessedPosTags().contains(
-//                    posTag.getTag())){
-//                if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
-//                    return true;
-//                } // else probability to low for inclusion
-//            } else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
-//                return false;
-//            } // else probability to low for exclusion
-//        }
-//        return token.getSpan().length() >= elc.getMinSearchTokenLength();
-//    }
-
-// Both
-//    protected boolean isMatchableToken(Token token){
-//        for(Value<PosTag> posAnnotation : token.getAnnotations(POS_ANNOTATION)){
-//            PosTag posTag = posAnnotation.value();
-//            if(posTag.isMapped()){
-//                if(!Collections.disjoint(tpc.getMatchableLexicalCategories(), 
-//                    posTag.getCategories())){
-//                    if(posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
-//                        return true;
-//                    } // else probability to low for inclusion
-//                } else if(posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
-//                    return false;
-//                } // else probability to low for exclusion
-//            } //else not matched ... search next one
-//        }
-//        return token.getSpan().length() >= elc.getMinSearchTokenLength();        
-//    }
-//    
-//    
-//    protected boolean isProcesableChunk(Chunk chunk){
-//        for(Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)){
-//            if(tpc.getProcessedPhraseCategories().contains(
-//                phraseAnnotation.value().getCategory()) ||
-//                tpc.getProcessedPhraseTags().contains(
-//                    phraseAnnotation.value().getTag())){
-//                if(phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()){
-//                    return true;
-//                } // else probability to low for inclusion
-//            } else if(phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()){
-//                return false;
-//            } // else probability to low for exclusion
-//        }
-//        //neither a clear accept/reject ...
-//        return true;
-//    }
-    
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder();
@@ -510,329 +300,4 @@ public class ProcessingState {
         return sb.toString();
     }
     
-    /**
-     * Internally used to store additional Metadata for Tokens of the current Sentence
-     * <p>
-     * Checks if the parsed {@link Token} is processable. This decision is taken first based on the POS
-     * annotation ( Lexical Category, POS tag) and second on the
-     * {@link EntityLinkerConfig#getMinSearchTokenLength()} if no POS annotations are available or the
-     * probability of the POS annotations is to low.
-     * <p>
-     * Since STANBOL-685two POS Probabilities are used <ul>
-     * <li> {@link LanguageProcessingConfig#getMinPosAnnotationProbability()} for accepting POS tags that are
-     * processed - included in {@link LanguageProcessingConfig#getLinkedLexicalCategories()} or
-     * {@link LanguageProcessingConfig#getLinkedPosTags()}.
-     * <li> {@link LanguageProcessingConfig#getMinExcludePosAnnotationProbability()} for those that are not
-     * processed. By default the exclusion probability is set to half of the inclusion one.
-     * </ul>
-     * Assuming that the <code>minPosTypePropb=0.667</code> a
-     * <ul>
-     * <li>noun with the prop 0.8 would result in returning <code>true</code>
-     * <li>noun with prop 0.5 would return <code>null</code>
-     * <li>verb with prop 0.4 would return <code>false</code>
-     * <li>verb with prop 0.3 would return <code>null</code>
-     * </ul>
-     * This algorithm makes it less likely that the {@link EntityLinkerConfig#getMinSearchTokenLength()} needs
-     * to be used as fallback for Tokens (what typically still provides better estimations as the token
-     * length).
-     * <p>
-     * (see also STANBOL-685 even that this Issue refers a version of this Engine that has not yet used the
-     * Stanbol NLP processing chain)
-     * 
-     * @param token
-     *            the {@link Token} to check.
-     * @return <code>true</code> if the parsed token needs to be processed. Otherwise <code>false</code>
-     */
-    public class TokenData {
-        /** The Token */
-        public final Token token;
-        /** The index of the Token within the current Section (Sentence) */
-        public final int index;
-        /** If this Token should be linked with the Vocabulary */
-        public boolean isLinkable;
-        /** If this Token should be used for multi word searches in the Vocabulary */
-        public boolean isMatchable;
-        /** if this Token has an alpha or numeric char */
-        public final boolean hasAlphaNumeric;
-        /** the chunk of this Token */
-        public final ChunkData inChunk;
-        /** the morphological features of the Token (selected based on the POS Tag) */
-        public final MorphoFeatures morpho;
-        /**
-         * if this token starts with an upperCase letter
-         */
-        public final boolean upperCase;
-        /**
-         * if the length of the token is &gt;= {@link LanguageProcessingConfig#getMinSearchTokenLength()}
-         */
-        public boolean hasSearchableLength;
-        /**
-         * If the POS type of this word matches a linkable category
-         */
-        public final Boolean isLinkablePos;
-        /**
-         * if the POS type of this word matches a matchable category
-         */
-        public final Boolean isMatchablePos;
-        /**
-         * if this Token represents the start of an sub-sentence such as an 
-         * starting ending quote 
-         * @see ProcessingState#SUB_SENTENCE_START_POS
-         */
-        public final boolean isSubSentenceStart;
-        /**
-         * Constructs and initializes meta data needed for linking based 
-         * on the current tokens (and its NLP annotation)
-         * @param index the index of the Token within the current section
-         * @param token the token
-         * @param chunk the current chunk or <code>null</code> if none
-         */
-        TokenData(int index,Token token, ChunkData chunk) {
-            //(0) init fields
-            this.token = token;
-            this.index = index;
-            this.inChunk = chunk;
-            this.hasAlphaNumeric = Utils.hasAlphaNumericChar(token.getSpan());
-            this.hasSearchableLength = token.getSpan().length() >= tpc.getMinSearchTokenLength();
-            PosTag selectedPosTag = null;
-            boolean matchedPosTag = false; //matched any of the POS annotations
-            
-            //(1) check if this Token should be linked against the Vocabulary (isProcessable)
-            upperCase = token.getEnd() > token.getStart() && //not an empty token
-                    Character.isUpperCase(token.getSpan().codePointAt(0)); //and upper case
-            boolean isLinkablePos = false;
-            boolean isMatchablePos = false;
-            boolean isSubSentenceStart = false;
-            List<Value<PosTag>> posAnnotations = token.getAnnotations(POS_ANNOTATION);
-            for(Value<PosTag> posAnnotation : posAnnotations){
-                // check three possible match
-                //  1. the LexicalCategory matches
-                //  2. the Pos matches
-                //  3. the String tag matches
-                PosTag posTag = posAnnotation.value();
-                if((!disjoint(tpc.getLinkedLexicalCategories(), posTag.getCategories())) ||
-                        (!disjoint(tpc.getLinkedPos(), posTag.getPosHierarchy())) ||
-                        tpc.getLinkedPosTags().contains(posTag.getTag())){
-                    if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
-                            posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
-                        selectedPosTag = posTag;
-                        isLinkablePos = true;
-                        isMatchablePos = true;
-                        matchedPosTag = true;
-                        break;
-                    } // else probability to low for inclusion
-                } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
-                        posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
-                    selectedPosTag = posTag; //also rejected PosTags are selected
-                    matchedPosTag = true;
-                    isLinkablePos = false;
-                    break;
-                } // else probability to low for exclusion
-            }
-            if(!matchedPosTag) { //not matched against a POS Tag ...
-                this.isLinkablePos = null;
-            } else {
-                this.isLinkablePos = isLinkablePos;
-            }
-            
-            //(2) check if this token should be considered to match labels of suggestions
-            if(this.isLinkablePos != null && this.isLinkablePos){ //processable tokens are also matchable
-                this.isMatchablePos = true;
-            } else { //check POS and length to see if token is matchable
-                matchedPosTag = false; //reset to false!
-                for(Value<PosTag> posAnnotation : posAnnotations){
-                    PosTag posTag = posAnnotation.value();
-                    if(posTag.isMapped()){
-                        if(!Collections.disjoint(tpc.getMatchedLexicalCategories(), 
-                            posTag.getCategories())){
-                            if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
-                                    posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
-                                //override selectedPosTag if present
-                                selectedPosTag = posTag; //mark the matchable as selected PosTag
-                                isMatchablePos = true;
-                                matchedPosTag = true;
-                                break;
-                            } // else probability to low for inclusion
-                        } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
-                                posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
-                            if(selectedPosTag == null){ //do not override existing values
-                                selectedPosTag = posTag; //also rejected PosTags are selected
-                            }
-                            isMatchablePos = false;
-                            matchedPosTag = true;
-                            break;
-                        } // else probability to low for exclusion
-                    } //else not matched ... search next one
-                }
-                if(!matchedPosTag){ //not matched against POS tag ...
-                    //fall back to the token length
-                    this.isMatchablePos = null;
-                    //this.isMatchablePos = token.getSpan().length() >= tpc.getMinSearchTokenLength();    
-                } else {
-                    this.isMatchablePos = isMatchablePos;
-                }
-            }
-            //(3) check if the POS tag indicates the start/end of an sub-sentence
-            for(Value<PosTag> posAnnotation : posAnnotations){
-                PosTag posTag = posAnnotation.value();
-                if((!disjoint(SUB_SENTENCE_START_POS,posTag.getPosHierarchy()))){
-                    if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
-                            posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
-                        isSubSentenceStart = true;
-                    } // else probability to low for inclusion
-                } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
-                        posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
-                    isSubSentenceStart = false;
-                }
-            }
-            this.isSubSentenceStart = isSubSentenceStart;
-            
-            //(4) check for morpho analyses
-            if(selectedPosTag == null){ //token is not processable or matchable
-                //we need to set the selectedPoas tag to the first POS annotation
-                Value<PosTag> posAnnotation = token.getAnnotation(POS_ANNOTATION);
-                if(posAnnotation != null) {
-                    selectedPosTag = posAnnotation.value();
-                }
-            }
-            List<Value<MorphoFeatures>> morphoAnnotations = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
-            if(selectedPosTag == null){ //no POS information ... use the first morpho annotation
-                morpho = morphoAnnotations.isEmpty() ? null : morphoAnnotations.get(0).value();
-            } else { //select the correct morpho annotation based on the POS tag
-                MorphoFeatures mf = null;
-                selectMorphoFeature : 
-                for(Value<MorphoFeatures> morphoAnnotation : morphoAnnotations){
-                    for(PosTag posTag : morphoAnnotation.value().getPosList()){
-                        if(!disjoint(selectedPosTag.getCategories(),posTag.getCategories())){
-                            mf = morphoAnnotation.value();
-                            break selectMorphoFeature; //stop after finding the first one
-                        }
-                    }
-                }
-                morpho = mf;
-            }
-            
-        }
-        
-        /**
-         * Getter for token text
-         * @return the text of the token
-         */
-        public String getTokenText(){
-            return token.getSpan();
-        }
-        /**
-         * Getter for the Lemma of the token. 
-         * @return the Lemma of the Token or <code>null</code> if not available
-         */
-        public String getTokenLemma(){
-            return morpho != null ? morpho.getLemma() : null;
-        }
-        @Override
-        public String toString() {
-            return new StringBuilder("TokenData: '").append(getTokenText())
-                    .append("'[linkable=").append(isLinkable).append("(linkabkePos=").append(isLinkablePos)
-                    .append(")| matchable=").append(isMatchable).append("(matchablePos=").append(isMatchablePos)
-                    .append(")| alpha=").append(hasAlphaNumeric).append("| seachLength=")
-                    .append(hasSearchableLength).append("| upperCase=").append(upperCase)
-                    .append("]").toString();
-        }  
-    }
-    /** 
-     * Represents a Chunk (group of tokens) used as context for EntityLinking.
-     * Typically a single {@link ChunkData#chunk} is used, but in case of
-     * overlapping and {@link ChunkData#isProcessable processable} chunks
-     * multiple {@link Chunk}s might be merged to a single {@link ChunkData}
-     * instance. In such cases {@link ChunkData#chunk} represents the
-     * first and {@link ChunkData#merged} the last of the merged chunks.<p>
-     * {@link ChunkData#startToken} and {@link ChunkData#endToken} represent
-     * the covered [start,end) {@link Token} indices relative to the current
-     * sections (typically a {@link Sentence}). {@link ChunkData#getStartChar()}
-     * and {@link ChunkData#getEndChar()} are the absolute [start,end) character
-     * indices within the {@link AnalysedText#getSpan()}
-     */
-    public class ChunkData {
-        protected final static boolean DEFAULT_PROCESSABLE_STATE = true;
-        /** if the Chunk is processable */
-        public final boolean isProcessable;
-        /** the Chunk */
-        public final Chunk chunk;
-        /** 
-         * In case multiple overlapping and processable {@link Chunk}s the
-         * section selected by the chunks are merged. While {@link #chunk}
-         * holds the original chunk (the first) this variable holds the
-         * last merged one. Enclosed chunks (in case more than two are
-         * merged) are not available via this class, but can be retrieved
-         * by iterating over the {@link AnalysedText} content part.
-         */
-        private Chunk merged;
-        /** the start token index relative to the current section (sentence) */
-        private int startToken;
-        /** the end token index relative to the current section (sentence) */
-        private int endToken;
-        /**
-         * The number of matchable Tokens enclosed by this Chunk
-         */
-        int matchableCount;
-        /**
-         * constructs and initializes the meta data for the parsed {@link Chunk}
-         * @param chunk
-         */
-        ChunkData(Chunk chunk){
-            this.chunk = chunk;
-            Boolean process = null;
-            for (Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)) {
-                if (tpc.getProcessedPhraseCategories().contains(phraseAnnotation.value().getCategory())
-                    || tpc.getProcessedPhraseTags().contains(phraseAnnotation.value().getTag())) {
-                    if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
-                            phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()) {
-                        process = true;
-                        break;
-                    } // else probability to low for inclusion
-                } else if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
-                        phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()) {
-                    process = false;
-                    break;
-                } // else probability to low for exclusion
-            }
-            isProcessable = process == null ? DEFAULT_PROCESSABLE_STATE : process;
-        }
-        /**
-         * Getter for the start character position
-         * @return the start character position of the selected text span.
-         */
-        public int getStartChar(){
-            return chunk.getStart();
-        }
-        /**
-         * Getter for the end character position of the text selected by
-         * possible multiple {@link #merged} chunks.
-         * @return the end character position considering possible {@link #merged}
-         * chunks.
-         */
-        public int getEndChar(){
-            return merged == null ? chunk.getEnd() : merged.getEnd();
-        }
-        /**
-         * If this chunk is processable
-         * @return the state
-         */
-        public boolean isProcessable() {
-            return isProcessable;
-        }
-        /**
-         * Getter for the number of matchable tokens contained in this chunk
-         * @return The number of matchable tokens contained in this chunk
-         */
-        public int getMatchableCount() {
-            return matchableCount;
-        }
-        public int getStartTokenIndex() {
-            return startToken;
-        }
-        public int getEndTokenIndex() {
-            return endToken;
-        }
-    }
-    
 }
\ No newline at end of file

Added: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java?rev=1516775&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/SectionData.java Fri Aug 23 09:22:53 2013
@@ -0,0 +1,188 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class SectionData {
+    // Per-section state: the TokenData of all Tokens enclosed by a Section plus a flag telling if any of them is linkable.
+    private static final Logger log = LoggerFactory.getLogger(SectionData.class);
+    
+    /**
+     * The section whose enclosed spans were processed by the constructor
+     */
+    public final Section section;
+    /**
+     * Holds the {@link TokenData} for the {@link Token}s of the current {@link #section}
+     * to allow fast index based access.
+     */
+    private List<TokenData> tokens = new ArrayList<TokenData>(64);
+    /**
+     * If a linkable token is present in this section
+     */
+    private boolean hasLinkableToken = false;
+    /** Iterates over the spans of the given types enclosed by the section and computes the linkable/matchable state of every Token. */
+    public SectionData(LanguageProcessingConfig tpc, Section section, 
+            Set<SpanTypeEnum> enclosedSpanTypes, boolean isUnicaseLanguage){
+        this.section = section;
+        Iterator<Span> enclosed = section.getEnclosed(enclosedSpanTypes);
+        ChunkData activeChunk = null; //the processable chunk currently open (null if none)
+        while(enclosed.hasNext()){
+            Span span = enclosed.next();
+            if(span.getStart() >= span.getEnd()){ //safeguard: warn about empty spans (NOTE: the span is still processed below)
+                log.warn("Detected Empty Span {} in section {}: '{}'",
+                    new Object[]{span,section, section.getSpan()});
+            }
+            if(span.getType() == SpanTypeEnum.Chunk){
+                ChunkData chunkData = new ChunkData(tpc,(Chunk)span);
+                if(chunkData.isProcessable()){
+                    if(activeChunk != null){ //current Chunk not yet closed -> overlapping chunks!
+                        if(activeChunk.getEndChar() < span.getEnd()){ //merge partly overlapping chunks
+                            log.info("   - merge overlapping and processable Chunks {} <-> {}",
+                                activeChunk.merged == null? activeChunk.chunk : activeChunk.merged,span);
+                            activeChunk.merged = (Chunk)span; //set this one as last merged
+                        } //ignore completely covered chunks
+                    } else { // a new Chunk starts
+                        activeChunk = chunkData;
+                        activeChunk.startToken = tokens.size(); //index the next added token will get
+                        if(log.isDebugEnabled()){
+                            log.debug(">> Chunk: (type:{}, startPos: {}) text: '{}'",
+                                new Object []{
+                                    activeChunk.chunk.getType(),
+                                    activeChunk.startToken,
+                                    activeChunk.chunk.getSpan()
+                                });
+                        }
+                    } 
+                } //else ignore chunks that are not processable
+            } else if(span.getType() == SpanTypeEnum.Token){
+                TokenData tokenData = new TokenData(tpc,tokens.size(),(Token)span,activeChunk);
+                if(log.isDebugEnabled()){
+                    log.debug("  > {}: {} {}(pos:{}) chunk: '{}'",
+                        new Object[]{tokenData.index,tokenData.token,
+                            tokenData.morpho != null ? ("(lemma: "+tokenData.morpho.getLemma()+") ") : "",
+                            tokenData.token.getAnnotations(POS_ANNOTATION),
+                            tokenData.inChunk != null ? tokenData.inChunk.chunk.getSpan() : "none"});
+                }
+                if(!tokenData.hasAlphaNumeric){ //tokens without alpha-numeric chars are neither linked nor matched
+                    tokenData.isLinkable = false;
+                    tokenData.isMatchable = false;
+                } else {
+                    // (1) apply basic rules for linkable/processable tokens
+                    //determine if the token should be linked/matched (unknown POS state counts as not linkable here)
+                    tokenData.isLinkable = tokenData.isLinkablePos != null ? tokenData.isLinkablePos : false;
+                    //matchable := linkable OR has matchablePos
+                    tokenData.isMatchable = tokenData.isLinkable || 
+                            (tokenData.isMatchablePos != null && tokenData.isMatchablePos);
+                    
+                    //(2) for non linkable tokens check for upper case rules
+                    if(!tokenData.isLinkable && tokenData.upperCase && 
+                            tokenData.index > 0 && //not a sentence or sub-sentence start
+                            !tokens.get(tokenData.index-1).isSubSentenceStart){
+                        //We have an upper case token!
+                        if(tpc.isLinkUpperCaseTokens()){
+                            if(tokenData.isMatchable) { //convert matchable to 
+                                tokenData.isLinkable = true; //linkable
+                                tokenData.isMatchable = true;
+                            } else { // and other tokens to
+                                tokenData.isMatchable = true; //matchable
+                            }
+                        } else { 
+                            //finally we need to convert other Tokens to matchable
+                            //if MatchUpperCaseTokens is active
+                            if(!tokenData.isMatchable && tpc.isMatchUpperCaseTokens()){
+                                tokenData.isMatchable = true;
+                            }
+                        }
+                    } //else not an upper case token
+                    
+                    //(3) Unknown POS tag Rules (see STANBOL-1049)
+                    if(!tokenData.isLinkable && (tokenData.isLinkablePos == null || 
+                            tokenData.isMatchablePos == null)){
+                        if(isUnicaseLanguage || !tpc.isLinkOnlyUpperCaseTokensWithUnknownPos()){
+                            if(tokenData.isLinkablePos == null && tokenData.hasSearchableLength){
+                                tokenData.isLinkable = true;
+                                tokenData.isMatchable = true;
+                            } //else no need to change the state
+                        } else { //non unicase language and link only upper case tokens enabled
+                            if(tokenData.upperCase && // upper case token
+                                    tokenData.index > 0 && //not a sentence or sub-sentence start
+                                    !tokens.get(tokenData.index-1).isSubSentenceStart){
+                                if(tokenData.hasSearchableLength && tokenData.isLinkablePos == null){
+                                    tokenData.isLinkable = true;
+                                    tokenData.isMatchable = true;
+                                } else if(tokenData.isMatchablePos == null){
+                                    tokenData.isMatchable = true;
+                                }
+                            } else if(tokenData.hasSearchableLength &&  //lower case and long token
+                                    tokenData.isMatchablePos == null){ 
+                                tokenData.isMatchable = true;
+                            } //else lower case and short word 
+                        }
+                    } //else already linkable or POS tag present
+                }
+                log.debug("    - {}",tokenData); 
+                //add the token to the list
+                tokens.add(tokenData);
+                if(!hasLinkableToken){ //once set to true the flag is never reset
+                    hasLinkableToken = tokenData.isLinkable;
+                }
+                if(activeChunk != null){
+                    if (tokenData.isLinkable){
+                        //ignore matchableCount in Chunks with linkable Tokens
+                        activeChunk.matchableCount = -10; //by setting the count to -10
+                    } else if(tokenData.isMatchable){
+                        activeChunk.matchableCount++;
+                    }
+                    if (span.getEnd() >= activeChunk.getEndChar()){
+                        //this is the last token in the current chunk
+                        activeChunk.endToken = tokens.size()-1; //inclusive index of the token just added
+                        log.debug("   - end Chunk@pos: {}", activeChunk.endToken);
+                        if(tpc.isLinkMultiMatchableTokensInChunk() && 
+                                activeChunk.getMatchableCount() > 1 ){
+                            log.debug("   - multi-matchable Chunk:");
+                            //mark the last of two immediately following matchable
+                            //tokens as processable (walks backwards through the chunk)
+                            for(int i = activeChunk.endToken-1;i >= activeChunk.startToken+1;i--){ //NOTE(review): starts at endToken-1, so the chunk's last token is never 'ct' - confirm intended
+                                TokenData ct = tokens.get(i);
+                                TokenData pt = tokens.get(i-1);
+                                if(ct.isMatchable && pt.isMatchable){
+                                    if(!ct.isLinkable) { //if not already processable
+                                        log.debug("     > convert Token {}: {} (pos:{}) from matchable to processable",
+                                            new Object[]{i,ct.token.getSpan(),ct.token.getAnnotations(POS_ANNOTATION)});
+                                        ct.isLinkable = true;
+                                        if(!hasLinkableToken){
+                                            hasLinkableToken = true;
+                                        }
+                                    }
+                                    i--;//mark both (ct & pt) as processed
+                                }
+                            }
+                        }
+                        activeChunk = null; //chunk is closed; following tokens are outside of it
+                    }
+                }
+            }
+        }
+    }
+    /** Getter for the {@link TokenData} of this section in token order (the internal list, not a copy). */
+    public List<TokenData> getTokens() {
+        return tokens;
+    }
+    /** If at least one token of this section was marked as linkable by the constructor. */
+    public boolean hasLinkableToken() {
+        return hasLinkableToken;
+    }
+}
\ No newline at end of file

Added: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java?rev=1516775&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TokenData.java Fri Aug 23 09:22:53 2013
@@ -0,0 +1,244 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static java.util.Collections.disjoint;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+/**
+ * Internally used to store additional metadata for the {@link Token}s of the current sentence.
+ * <p>
+ * The constructor checks if the parsed {@link Token} is processable. This decision is taken first based
+ * on the POS annotation (lexical category, POS tag) and second on the
+ * {@link EntityLinkerConfig#getMinSearchTokenLength()} if no POS annotations are available or the
+ * probability of the POS annotations is too low. The POS based results are stored in
+ * {@link #isLinkablePos} and {@link #isMatchablePos} (<code>null</code> if no POS annotation was
+ * decisive); the result of the length check is stored in {@link #hasSearchableLength}.
+ * <p>
+ * Since STANBOL-685 two POS probabilities are used <ul>
+ * <li> {@link LanguageProcessingConfig#getMinPosAnnotationProbability()} for accepting POS tags that are
+ * processed - included in {@link LanguageProcessingConfig#getLinkedLexicalCategories()} or
+ * {@link LanguageProcessingConfig#getLinkedPosTags()}.
+ * <li> {@link LanguageProcessingConfig#getMinExcludePosAnnotationProbability()} for those that are not
+ * processed. By default the exclusion probability is set to half of the inclusion one.
+ * </ul>
+ * Assuming that the <code>minPosTypePropb=0.667</code> (with nouns configured as linked and verbs not) a
+ * <ul>
+ * <li>noun with the probability 0.8 would result in {@link #isLinkablePos} being <code>true</code>
+ * <li>noun with the probability 0.5 would result in <code>null</code>
+ * <li>verb with the probability 0.4 would result in <code>false</code>
+ * <li>verb with the probability 0.3 would result in <code>null</code>
+ * </ul>
+ * This algorithm makes it less likely that the {@link EntityLinkerConfig#getMinSearchTokenLength()} needs
+ * to be used as fallback for Tokens (POS annotations typically still provide better estimations than
+ * the token length).
+ * <p>
+ * (see also STANBOL-685, even though that issue refers to a version of this Engine that did not yet use
+ * the Stanbol NLP processing chain)
+ */
+public class TokenData {
+    /** The Token */
+    public final Token token;
+    /** The index of the Token within the current Section (Sentence) */
+    public final int index;
+    /** If this Token should be linked with the Vocabulary. Not set by the constructor. */
+    public boolean isLinkable;
+    /**
+     * If this Token should be used for multi word searches in the Vocabulary.
+     * Not set by the constructor.
+     */
+    public boolean isMatchable;
+    /** if this Token has an alpha or numeric char */
+    public final boolean hasAlphaNumeric;
+    /** the chunk of this Token (<code>null</code> if none was parsed to the constructor) */
+    public final ChunkData inChunk;
+    /**
+     * the morphological features of the Token (selected based on the POS Tag);
+     * <code>null</code> if no suitable morpho annotation is present
+     */
+    public final MorphoFeatures morpho;
+    /**
+     * if this token starts with an upperCase letter
+     * (always <code>false</code> for empty tokens)
+     */
+    public final boolean upperCase;
+    /**
+     * if the length of the token is &gt;= {@link LanguageProcessingConfig#getMinSearchTokenLength()}
+     */
+    public boolean hasSearchableLength;
+    /**
+     * If the POS type of this word matches a linkable category.
+     * <code>null</code> if none of the POS annotations was accepted or rejected
+     * with a sufficient probability.
+     */
+    public final Boolean isLinkablePos;
+    /**
+     * if the POS type of this word matches a matchable category.
+     * <code>null</code> if none of the POS annotations was accepted or rejected
+     * with a sufficient probability.
+     */
+    public final Boolean isMatchablePos;
+    /**
+     * if this Token represents the start of a sub-sentence such as a
+     * starting or ending quote
+     * @see ProcessingState#SUB_SENTENCE_START_POS
+     */
+    public final boolean isSubSentenceStart;
+    /**
+     * Constructs and initializes meta data needed for linking based
+     * on the current token (and its NLP annotations)
+     * @param tpc the language processing configuration providing the linked/matched
+     * POS categories, the POS probability thresholds and the minimum search token length
+     * @param index the index of the Token within the current section
+     * @param token the token
+     * @param chunk the current chunk or <code>null</code> if none
+     */
+    public TokenData(LanguageProcessingConfig tpc, int index,Token token, ChunkData chunk) {
+        //(0) init fields
+        this.token = token;
+        this.index = index;
+        this.inChunk = chunk;
+        this.hasAlphaNumeric = Utils.hasAlphaNumericChar(token.getSpan());
+        this.hasSearchableLength = token.getSpan().length() >= tpc.getMinSearchTokenLength();
+        PosTag selectedPosTag = null; //used in step (4) to select the morpho features
+        boolean matchedPosTag = false; //matched any of the POS annotations
+        
+        //(1) check if this Token should be linked against the Vocabulary (isProcessable)
+        upperCase = token.getEnd() > token.getStart() && //not an empty token
+                Character.isUpperCase(token.getSpan().codePointAt(0)); //and upper case
+        boolean isLinkablePos = false;
+        boolean isMatchablePos = false;
+        boolean isSubSentenceStart = false;
+        List<Value<PosTag>> posAnnotations = token.getAnnotations(POS_ANNOTATION);
+        for(Value<PosTag> posAnnotation : posAnnotations){
+            // check the three possible matches
+            //  1. the LexicalCategory matches
+            //  2. the Pos matches
+            //  3. the String tag matches
+            PosTag posTag = posAnnotation.value();
+            if((!disjoint(tpc.getLinkedLexicalCategories(), posTag.getCategories())) ||
+                    (!disjoint(tpc.getLinkedPos(), posTag.getPosHierarchy())) ||
+                    tpc.getLinkedPosTags().contains(posTag.getTag())){
+                if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+                        posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+                    selectedPosTag = posTag;
+                    isLinkablePos = true;
+                    isMatchablePos = true;
+                    matchedPosTag = true;
+                    break; //the first accepted annotation decides
+                } // else probability too low for inclusion
+            } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+                    posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+                selectedPosTag = posTag; //also rejected PosTags are selected
+                matchedPosTag = true;
+                isLinkablePos = false;
+                break; //the first rejected annotation decides
+            } // else probability too low for exclusion
+        }
+        if(!matchedPosTag) { //not matched against a POS Tag ... undecided (null)
+            this.isLinkablePos = null;
+        } else {
+            this.isLinkablePos = isLinkablePos;
+        }
+        
+        //(2) check if this token should be considered to match labels of suggestions
+        if(this.isLinkablePos != null && this.isLinkablePos){ //processable tokens are also matchable
+            this.isMatchablePos = true;
+        } else { //check the POS annotations to see if the token is matchable
+            matchedPosTag = false; //reset to false!
+            for(Value<PosTag> posAnnotation : posAnnotations){
+                PosTag posTag = posAnnotation.value();
+                if(posTag.isMapped()){ //only mapped POS tags are considered here
+                    if(!Collections.disjoint(tpc.getMatchedLexicalCategories(), 
+                        posTag.getCategories())){
+                        if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+                                posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+                            //override selectedPosTag if present
+                            selectedPosTag = posTag; //mark the matchable as selected PosTag
+                            isMatchablePos = true;
+                            matchedPosTag = true;
+                            break;
+                        } // else probability too low for inclusion
+                    } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+                            posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+                        if(selectedPosTag == null){ //do not override existing values
+                            selectedPosTag = posTag; //also rejected PosTags are selected
+                        }
+                        isMatchablePos = false;
+                        matchedPosTag = true;
+                        break;
+                    } // else probability too low for exclusion
+                } //else not matched ... search next one
+            }
+            if(!matchedPosTag){ //not matched against POS tag ...
+                //no POS based decision: the field is left null (undecided). The token length
+                //fallback below is disabled; the length check itself is kept in hasSearchableLength
+                this.isMatchablePos = null;
+                //this.isMatchablePos = token.getSpan().length() >= tpc.getMinSearchTokenLength();    
+            } else {
+                this.isMatchablePos = isMatchablePos;
+            }
+        }
+        //(3) check if the POS tag indicates the start/end of a sub-sentence
+        for(Value<PosTag> posAnnotation : posAnnotations){ //no break: a later annotation may override
+            PosTag posTag = posAnnotation.value();
+            if((!disjoint(ProcessingState.SUB_SENTENCE_START_POS,posTag.getPosHierarchy()))){
+                if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+                        posAnnotation.probability() >= tpc.getMinPosAnnotationProbability()){
+                    isSubSentenceStart = true;
+                } // else probability too low for inclusion
+            } else if(posAnnotation.probability() == Value.UNKNOWN_PROBABILITY ||
+                    posAnnotation.probability() >= tpc.getMinExcludePosAnnotationProbability()){
+                isSubSentenceStart = false;
+            }
+        }
+        this.isSubSentenceStart = isSubSentenceStart;
+        
+        //(4) check for morpho analyses
+        if(selectedPosTag == null){ //token is not processable or matchable
+            //we need to set the selectedPosTag to the POS annotation returned by getAnnotation(..)
+            Value<PosTag> posAnnotation = token.getAnnotation(POS_ANNOTATION);
+            if(posAnnotation != null) {
+                selectedPosTag = posAnnotation.value();
+            }
+        }
+        List<Value<MorphoFeatures>> morphoAnnotations = token.getAnnotations(NlpAnnotations.MORPHO_ANNOTATION);
+        if(selectedPosTag == null){ //no POS information ... use the first morpho annotation
+            morpho = morphoAnnotations.isEmpty() ? null : morphoAnnotations.get(0).value();
+        } else { //select the correct morpho annotation based on the POS tag
+            MorphoFeatures mf = null; //stays null if no morpho annotation shares a category
+            selectMorphoFeature : 
+            for(Value<MorphoFeatures> morphoAnnotation : morphoAnnotations){
+                for(PosTag posTag : morphoAnnotation.value().getPosList()){
+                    if(!disjoint(selectedPosTag.getCategories(),posTag.getCategories())){
+                        mf = morphoAnnotation.value();
+                        break selectMorphoFeature; //stop after finding the first one
+                    }
+                }
+            }
+            morpho = mf;
+        }
+        
+    }
+    
+    /**
+     * Getter for token text
+     * @return the text of the token
+     */
+    public String getTokenText(){
+        return token.getSpan();
+    }
+    /**
+     * Getter for the Lemma of the token.
+     * @return the Lemma of the Token or <code>null</code> if no morpho features
+     * were selected for this token
+     */
+    public String getTokenLemma(){
+        return morpho != null ? morpho.getLemma() : null;
+    }
+    @Override
+    public String toString() { //textual summary of the token text and the flags of this instance
+        return new StringBuilder("TokenData: '").append(getTokenText())
+                .append("'[linkable=").append(isLinkable).append("(linkabkePos=").append(isLinkablePos)
+                .append(")| matchable=").append(isMatchable).append("(matchablePos=").append(isMatchablePos)
+                .append(")| alpha=").append(hasAlphaNumeric).append("| seachLength=")
+                .append(hasSearchableLength).append("| upperCase=").append(upperCase)
+                .append("]").toString();
+    }  
+}
\ No newline at end of file