You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/24 10:40:11 UTC
svn commit: r1413155 [2/4] - in
/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking: ./
src/ src/license/ src/main/ src/main/java/ src/main/java/org/
src/main/java/org/apache/ src/main/java/org/apache/stanbol/
src/main/java/org/apac...
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,516 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.config;
+
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+public class LanguageProcessingConfig implements Cloneable{
+
+ /**
+ * The linked Phrase types. Includes {@link LexicalCategory#Noun} phrases
+ */
+ public static final Set<LexicalCategory> DEFAULT_PROCESSED_PHRASE_CATEGORIES =
+ EnumSet.of(LexicalCategory.Noun);
+ /**
+ * The default set of {@link LexicalCategory LexicalCategories} used to
+ * lookup (link) Entities within the {@link EntitySearcher}
+ */
+ public static final Set<LexicalCategory> DEFAULT_LINKED_LEXICAL_CATEGORIES =
+ EnumSet.of(LexicalCategory.Noun, LexicalCategory.Residual);
+
+ /**
+ * The default set of {@link LexicalCategory LexicalCategories} used to
+ * match (and search) for Entities.<p>
+ * Matched Tokens are not used for linking, but are considered when matching
+ * label tokens of Entities with the Text.
+ */
+ public static final Set<LexicalCategory> DEFAULT_MATCHED_LEXICAL_CATEGORIES =
+ EnumSet.of(LexicalCategory.Noun, LexicalCategory.Quantifier,LexicalCategory.Residual);
+
+ /**
+ * The default set of {@link Pos} types that are used to lookup (link) Entities.
+ * By defualt only {@link Pos#ProperNoun}s and two
+ * {@link LexicalCategory#Residual} acronyms and
+ * words marked as foreign material.
+ */
+ public static final Set<Pos> DEFAULT_LINKED_POS =
+ EnumSet.of(Pos.ProperNoun, Pos.Foreign, Pos.Acronym);
+
+ /**
+ * Default value for POS annotation confidence required for processed POS tags.
+ * Used for <ul>
+ * <li> {@link #getLinkedLexicalCategories()}
+ * <li> {@link #getLinkedPosTags()} and
+ * <li> {@link #getMatchedLexicalCategories()}
+ * <ul>
+ */
+ public static final double DEFAULT_MIN_POS_ANNOTATION_PROBABILITY = 0.75;
+
+ /**
+ * Default value for POS annotation confidence required for not-processed POS tags
+ * (not contained in both {@link #getLinkedLexicalCategories()} and
+ * {@link #getLinkedPosTags()}). <br> The default is
+ * <code>{@link #DEFAULT_MIN_POS_ANNOTATION_PROBABILITY}/2</code>
+ */
+ public static final double DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY/2;
+
+ /**
+ * By default {@link Chunk}s are considered
+ */
+ public static final boolean DEFAULT_IGNORE_CHUNK_STATE = false;
+ /**
+ * the minimum probability so that a phrase in processed based on the Phrase Annotation
+ */
+ public static final double DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY = 0.75;
+ /**
+ * the minimum probability so that a phrase is rejected based on the Phrase Annotation
+ */
+ public static final double DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY =
+ DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY/2;
+ /**
+ * The default for linking upper case tokens (regardless of length and POS)
+ * The default is <code>false</code> as some languages (like German) use upper
+ * case for Nouns and so this would also affect configurations that only
+ * link {@link Pos#ProperNoun}s
+ */
+ public static final boolean DEFAULT_LINK_UPPER_CASE_TOKEN_STATE = false;
+ /**
+ * The default for matching upper case tokens (regardless of length and POS)
+ * is <code>true</code>
+ */
+ public static final boolean DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE = true;
+ /**
+ * By default linking of chunks with multiple matchable tokens is enabled.
+ * This is useful to link Entities represented by two common nouns.
+ */
+ public static final boolean DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE = true;
+
+ /**
+ * The set of {@link PosTag#getCategory()} considered for EntityLinking
+ * @see #DEFAULT_LINKED_LEXICAL_CATEGORIES
+ */
+ private Set<LexicalCategory> linkedLexicalCategories = DEFAULT_LINKED_LEXICAL_CATEGORIES;
+
+ private Set<LexicalCategory> matchedLexicalCategories = DEFAULT_MATCHED_LEXICAL_CATEGORIES;
+
+ /**
+ * The linked {@link Pos} categories
+ */
+ private Set<Pos> linkedPos = DEFAULT_LINKED_POS;
+ /**
+ * The set of {@link PosTag#getTag()} values that are processed
+ */
+ private Set<String> linkedPosTags = Collections.emptySet();
+ /**
+ * The minimum confidence of POS annotations for {@link #getLinkedLexicalCategories()}
+ * and {@link #getLinkedPosTags()}
+ */
+ private double minPosAnnotationProbability = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY;
+
+ /**
+ * The minimum confidence that a POS annotation
+ */
+ private double minExcludePosAnnotationProbability = DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY/2;
+
+ private boolean ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
+
+
+ private double minPhraseAnnotationProbability = DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY;
+
+ private double minExcludePhraseAnnotationProbability = DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY;
+
+ private Set<LexicalCategory> processedPhraseCategories = DEFAULT_PROCESSED_PHRASE_CATEGORIES;
+
+ private Set<String> processedPhraseTags = Collections.emptySet();
+ /**
+ * If upper case tokens are linked (and matched)
+ */
+ private boolean linkUpperCaseTokensState = DEFAULT_LINK_UPPER_CASE_TOKEN_STATE;
+ /**
+ * If upper case tokens are matched
+ */
+ private boolean matchUpperCaseTokensState = DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE;
+ /**
+ * If for {@link Chunk}s with multiple matchable Tokens those should be
+ * linked.
+ */
+ private boolean linkMultiMatchableTokensInChunkState = DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
+
+
+ /**
+ * The language or <code>null</code> for the default configuration
+ * @param language
+ */
+ public LanguageProcessingConfig(){
+ }
+
+ public final boolean isIgnoreChunks() {
+ return ignoreChunksState;
+ }
+
+ /**
+ * Setter for the ignore {@link Chunk} state.
+ * @param state the state or <code>null</code> to set the
+ * {@link #DEFAULT_IGNORE_CHUNK_STATE}
+ */
+ public final void setIgnoreChunksState(Boolean state){
+ if(state == null){
+ this.ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
+ } else {
+ this.ignoreChunksState = state;
+ }
+ }
+
+ /**
+ * Getter for the set of {@link LexicalCategory LexicalCategories} used
+ * to link Entities in the configured Vocabulary.
+ * @return the set of {@link LexicalCategory LexicalCategories} used
+ * for linking.
+ * @see #DEFAULT_LINKED_LEXICAL_CATEGORIES
+ */
+ public final Set<LexicalCategory> getLinkedLexicalCategories() {
+ return linkedLexicalCategories;
+ }
+ /**
+ * Getter for the set of {@link LexicalCategory LexicalCategories} used
+ * to match label tokens of suggested Entities.
+ * @return the set of {@link LexicalCategory LexicalCategories} used for
+ * matching
+ */
+ public final Set<LexicalCategory> getMatchedLexicalCategories(){
+ return matchedLexicalCategories;
+ }
+ /**
+ * Setter for the matched lexical categories
+ * @param matchedLexicalCategories the set or <code>null</code>
+ * to set the {@link #DEFAULT_MATCHED_LEXICAL_CATEGORIES}
+ */
+ public void setMatchedLexicalCategories(Set<LexicalCategory> matchedLexicalCategories) {
+ if(matchedLexicalCategories == null){
+ this.matchedLexicalCategories = DEFAULT_MATCHED_LEXICAL_CATEGORIES;
+ } else {
+ this.matchedLexicalCategories = EnumSet.noneOf(LexicalCategory.class);
+ this.matchedLexicalCategories.addAll(matchedLexicalCategories);
+ }
+ }
+ /**
+ * The set of tags used for linking. This is useful if the string tags
+ * used by the POS tagger are not mapped to {@link LexicalCategory} nor
+ * {@link Pos} enum members.
+ * @return the set of pos tags used for linking entities
+ */
+ public final Set<String> getLinkedPosTags() {
+ return linkedPosTags;
+ }
+
+ /**
+ * Getter for the minimum probability of POS annotations for
+ * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
+ * @return the probability
+ */
+ public final double getMinPosAnnotationProbability() {
+ return minPosAnnotationProbability ;
+ }
+
+
+ /**
+ * Getter for the minimum probability of POS annotations not included in
+ * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
+ * @return the probability
+ */
+ public final double getMinExcludePosAnnotationProbability() {
+ return minExcludePosAnnotationProbability;
+ }
+
+ /**
+ * Setter for the minimum probability of POS annotations for
+ * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
+ * @param minPosAnnotationProbability the probability or <code>null</code> to set
+ * {@value #DEFAULT_MIN_POS_ANNOTATION_PROBABILITY}
+ */
+ public final void setMinPosAnnotationProbability(Double minPosAnnotationProbability) {
+ if(minPosAnnotationProbability == null){
+ this.minPosAnnotationProbability = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY;
+ } else if(minPosAnnotationProbability >= 0 && minPosAnnotationProbability <= 1) {
+ this.minPosAnnotationProbability = minPosAnnotationProbability;
+ } else {
+ throw new IllegalArgumentException("parsed value MUST BE in the range 0..1 or NULL to set the default");
+ }
+ }
+
+ /**
+ * Setter for the minimum probability of POS annotations not included in
+ * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
+ * @param minExcludePosAnnotationProbability the probability or <code>null</code> to set
+ * {@value #DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY}
+ */
+ public final void setMinExcludePosAnnotationProbability(Double minExcludePosAnnotationProbability){
+ if(minExcludePosAnnotationProbability == null){
+ this.minExcludePosAnnotationProbability = DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY;
+ } else if(minExcludePosAnnotationProbability >= 0 && minExcludePosAnnotationProbability <= 1) {
+ this.minExcludePosAnnotationProbability = minExcludePosAnnotationProbability;
+ } else {
+ throw new IllegalArgumentException("parsed value MUST BE in the range 0..1 or NULL to set the default");
+ }
+ }
+ /**
+ * Setter for the linked {@link LexicalCategory LexicalCategories}
+ * @param linkedLexicalCategories the set or <code>null</code> to set
+ * the {@link #DEFAULT_LINKED_LEXICAL_CATEGORIES}.
+ */
+ public final void setLinkedLexicalCategories(Set<LexicalCategory> linkedLexicalCategories) {
+ if(linkedLexicalCategories == null){
+ this.linkedLexicalCategories = DEFAULT_LINKED_LEXICAL_CATEGORIES;
+ } else if(linkedLexicalCategories.contains(null)){
+ throw new IllegalArgumentException("The parsed set with linked LexicalCategories MUST NOT contain the NULL element!");
+ } else {
+ this.linkedLexicalCategories = linkedLexicalCategories;
+ }
+ }
+ /**
+ * Setter for the linked {@link Pos} types.
+ * @param linkedLexicalCategories the set of linked {@link Pos} types or <code>null</code>
+ * to set the {@link #DEFAULT_LINKED_POS} types
+ */
+ public final void setLinkedPos(Set<Pos> linkedPos) {
+ if(linkedPos == null){
+ this.linkedPos = DEFAULT_LINKED_POS;
+ } else if(linkedPos.contains(null)){
+ throw new IllegalArgumentException("The parsed set with linked LexicalCategories MUST NOT contain the NULL element!");
+ } else {
+ this.linkedPos = linkedPos;
+ }
+ }
+ /**
+ * Setter for the linked Pos Tags. This should only be used of the
+ * used POS tagger uses {@link PosTag}s that are not mapped to
+ * {@link LexicalCategory LexicalCategories} nor {@link Pos} types.
+ * @param processedPosTags the linked Pos tags. if <code>null</code>
+ * the value is set to an empty set.
+ */
+ public final void setLinkedPosTags(Set<String> processedPosTags) {
+ if(processedPosTags == null){
+ this.linkedPosTags = Collections.emptySet();
+ } else if(processedPosTags.contains(null)){
+ throw new IllegalArgumentException("The parsed set with processed POS tags MUST NOT contain the NULL element!");
+ } else {
+ this.linkedPosTags = processedPosTags;
+ }
+ }
+ /**
+ * Getter for the processed phrase categories.
+ * {@link Chunk}s of other types will be ignored.
+ * @return
+ */
+ public Set<LexicalCategory> getProcessedPhraseCategories() {
+ return processedPhraseCategories;
+ }
+ /**
+ * Setter for the processable phrase categories.
+ * @param processablePhraseCategories the processable categories or
+ * <code>null</code> to set the {@link #DEFAULT_PROCESSED_PHRASE_CATEGORIES}.
+ */
+ public void setProcessedPhraseCategories(Set<LexicalCategory> processablePhraseCategories){
+ if(processablePhraseCategories == null){
+ this.processedPhraseCategories = DEFAULT_PROCESSED_PHRASE_CATEGORIES;
+ } else {
+ this.processedPhraseCategories = EnumSet.noneOf(LexicalCategory.class);
+ this.processedPhraseCategories.addAll(processablePhraseCategories);
+ }
+ }
+ /**
+ * Getter for the prococessed phrase Tags. This should be only
+ * used if the {@link PhraseTag}s used by the Chunker are not
+ * mapped to {@link LexicalCategory LexicalCategories}.
+ * @return the processed phrase tags
+ */
+ public Set<String> getProcessedPhraseTags() {
+ return processedPhraseTags;
+ }
+ /**
+ * Setter for the Processed Phrase Tags
+ * @param processedPhraseTags the set with the tags. If <code>null</code>
+ * the value is set to an empty set.
+ */
+ public void setProcessedPhraseTags(Set<String> processedPhraseTags) {
+ if(processedPhraseTags == null || processedPhraseTags.isEmpty()){
+ this.processedPhraseTags = Collections.emptySet();
+ } else {
+ this.processedPhraseTags = new HashSet<String>(processedPhraseTags);
+ }
+ }
+ /**
+ * Getter for the minimum required probability so that {@link PhraseTag}s
+ * are accepted.
+ * @return the probability [0..1)
+ */
+ public double getMinPhraseAnnotationProbability() {
+ return minPhraseAnnotationProbability;
+ }
+ /**
+ * Getter for the minimum required probability so that {@link PhraseTag}s
+ * are considered for rejecting (e.g. to skip a VerbPhrase if
+ * {@link LexicalCategory#Verb} is not present in
+ * {@link #getProcessedPhraseCategories()}). Typically this value is
+ * lower as {@link #getMinPhraseAnnotationProbability()}
+ * @return the probability [0..1)
+ */
+ public double getMinExcludePhraseAnnotationProbability() {
+ return minExcludePhraseAnnotationProbability;
+ }
+ /**
+ * Setter for the minimum phrase annotation probability [0..1)
+ * @param prob the probability [0..1) or <code>null</code> to set
+ * the {@value #DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY}
+ * @throws IllegalArgumentException if the parsed value is not
+ * in the range [0..1).
+ */
+ public void setMinPhraseAnnotationProbability(Double prob) {
+ if(prob == null){
+ this.minPhraseAnnotationProbability = DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY;
+ } else if (prob >= 1 || prob < 0){
+ throw new IllegalArgumentException("The parsed minimum phrase annotation probability '"
+ + prob +" MUST be in the range [0..1)!");
+ } else {
+ this.minPhraseAnnotationProbability = prob;
+ }
+ }
+
+ /**
+ * Setter for the minimum excluded phrase annotation probability [0..1)
+ * @param prob the probability [0..1) or <code>null</code> to set
+ * the {@value #DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY}
+ * @throws IllegalArgumentException if the parsed value is not
+ * in the range [0..1).
+ */
+ public void setMinExcludePhraseAnnotationProbability(Double prob) {
+ if(prob == null){
+ this.minExcludePhraseAnnotationProbability = DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY;
+ } else if (prob >= 1 || prob < 0){
+ throw new IllegalArgumentException("The parsed minimum exclude phrase annotation probability '"
+ + prob +" MUST be in the range [0..1)!");
+ } else {
+ this.minExcludePhraseAnnotationProbability = prob;
+ }
+ }
+ /**
+ * Getter for the set of {@link Pos} types used for linking Entities
+ * @return the linked {@link Pos} types
+ */
+ public Set<Pos> getLinkedPos() {
+ return linkedPos;
+ }
+
+ /**
+ * If upper case Tokens should be linked regardless
+ * of the POS type and length
+ * @return
+ */
+ public boolean isLinkUpperCaseTokens(){
+ return linkUpperCaseTokensState;
+ }
+ /**
+ * Setter for the state if upper case token should be
+ * linked regardless of the POS type and length
+ * @param linkUpperCaseTokensState the state or <code>null</code>
+ * to set the {@link #DEFAULT_LINK_UPPER_CASE_TOKEN_STATE}
+ */
+ public void setLinkUpperCaseTokensState(Boolean linkUpperCaseTokensState) {
+ if(linkUpperCaseTokensState == null){
+ this.linkUpperCaseTokensState = DEFAULT_LINK_UPPER_CASE_TOKEN_STATE;
+ } else {
+ this.linkUpperCaseTokensState = linkUpperCaseTokensState;
+ }
+ }
+ /**
+ * If upper case Tokens should be matched regardless
+ * of the POS type and length
+ * @return
+ */
+ public boolean isMatchUpperCaseTokens(){
+ return matchUpperCaseTokensState;
+ }
+ /**
+ * Setter for the state if upper case token should be
+ * matched regardless of the POS type and length
+ * @param matchUpperCaseTokensState the state or <code>null</code>
+ * to set the {@link #DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE}
+ */
+ public void setMatchUpperCaseTokensState(Boolean matchUpperCaseTokensState) {
+ if(matchUpperCaseTokensState == null){
+ this.matchUpperCaseTokensState = DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE;
+ } else {
+ this.matchUpperCaseTokensState = matchUpperCaseTokensState;
+ }
+ }
+ /**
+ * If {@link #isIgnoreChunks()} is disabled than this allows
+ * to convert matchable {@link Token}s to linked one in
+ * case a {@link Chunk} contains more than one matchable
+ * Token. <p>
+ * This is especially useful in cases where only
+ * {@link Pos#ProperNoun}s are processed to also detect
+ * Entities that are named by using multiple Common Nouns.
+ * In cases where all {@link LexicalCategory#Noun}s are
+ * processed this option has usually no influence on the
+ * results.
+ * @return the state
+ */
+ public boolean isLinkMultiMatchableTokensInChunk() {
+ return linkMultiMatchableTokensInChunkState;
+ }
+ /**
+ * Setter for state if for {@link Chunk}s with multiple
+ * matchable {@link Token}s those Tokens should be treated
+ * as linkable.<p>
+ * This is especially useful in cases where only
+ * {@link Pos#ProperNoun}s are linked to also detect
+ * Entities that are named by using multiple Common Nouns.
+ * In cases where all {@link LexicalCategory#Noun}s are
+ * processed this option has usually no influence on the
+ * results.
+ * @param state the state or <code>null</code> to reset to the
+ * the {@link #DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE default}
+ */
+ public void setLinkMultiMatchableTokensInChunkState(Boolean state){
+ if(state == null){
+ this.linkMultiMatchableTokensInChunkState = DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
+ } else {
+ this.linkMultiMatchableTokensInChunkState = state;
+ }
+ }
+ /**
+ * Clones the {@link LanguageProcessingConfig}. Intended to be used
+ * to create language specific configs based on the default one.
+ */
+ @Override
+ public LanguageProcessingConfig clone() {
+ LanguageProcessingConfig c = new LanguageProcessingConfig();
+ c.ignoreChunksState = ignoreChunksState;
+ c.minExcludePhraseAnnotationProbability = minExcludePhraseAnnotationProbability;
+ c.minExcludePosAnnotationProbability = minExcludePosAnnotationProbability;
+ c.minPhraseAnnotationProbability = minPhraseAnnotationProbability;
+ c.minPosAnnotationProbability = minPosAnnotationProbability;
+ c.linkedLexicalCategories = linkedLexicalCategories;
+ c.processedPhraseCategories = processedPhraseCategories;
+ c.processedPhraseTags = processedPhraseTags;
+ c.linkedPos = linkedPos;
+ c.linkedPosTags = linkedPosTags;
+ c.linkUpperCaseTokensState = linkUpperCaseTokensState;
+ c.matchUpperCaseTokensState = matchUpperCaseTokensState;
+ c.linkMultiMatchableTokensInChunkState = linkMultiMatchableTokensInChunkState;
+ c.matchedLexicalCategories = matchedLexicalCategories;
+ return c;
+ }
+
+
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,415 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.config;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.osgi.service.cm.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TextProcessingConfig {
+
+ private static final Logger log = LoggerFactory.getLogger(TextProcessingConfig.class);
+ /**
+ * If enabled only {@link Pos#ProperNoun}, {@link Pos#Foreign} and {@link Pos#Acronym} are Matched. If
+ * deactivated all Tokens with the category {@link LexicalCategory#Noun} and
+ * {@link LexicalCategory#Residual} are considered for matching.<p>
+ * This property allows an easy configuration of the matching that is sufficient for most usage scenarios.
+ * Users that need to have more control can configure language specific mappings by using
+ * {@link #PARAM_LEXICAL_CATEGORIES}, {@link #PARAM_POS_TYPES}, {@link #PARAM_POS_TAG} and
+ * {@link #PARAM_POS_PROBABILITY} in combination with the {@link #PROCESSED_LANGUAGES}
+ * configuration.<p>
+ * The {@link #DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE default} if this is <code>false</code>
+ */
+ public static final String PROCESS_ONLY_PROPER_NOUNS_STATE = "enhancer.engines.linking.properNounsState";
+ /**
+ * Default for the {@link #PROCESS_ONLY_PROPER_NOUNS_STATE} (false)
+ */
+ public static final boolean DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE = false;
+ /**
+ * Allows to configure the processed languages by using the syntax supported by {@link LanguageConfiguration}.
+ * In addition this engine supports language specific configurations for matched {@link LexicalCategory}
+ * {@link Pos} and String POS tags as well as Pos annotation probabilities by using the parameters
+ * {@link #PARAM_LEXICAL_CATEGORIES}, {@link #PARAM_POS_TYPES}, {@link #PARAM_POS_TAG} and
+ * {@link #PARAM_POS_PROBABILITY}.<p>
+ * See the documentation of {@link LanguageConfiguration} for details of the Syntax.
+ */
+ public static final String PROCESSED_LANGUAGES = "enhancer.engines.linking.processedLanguages";
+ /*
+ * Parameters used for language specific text processing configurations
+ */
+ // (1) PHRASE level
+ /**
+ * Allows to configure the processed Chunk type (the default is
+ * <code>cc={@link LexicalCategory#Noun Noun}</code> to process only
+ * Noun Phrases). If set to <code>cc</code> (empty value) processing
+ * of chunks is deactivated.
+ */
+ public static final String PARAM_PHRASE_CATEGORIES = "pc";
+ public static final String PARAM_PHRASE_TAG = "ptag";
+ public static final String PARAM_PHRASE_PROBABILITY = "pprob";
+ public static final String PARAM_LINK_MULTI_MATCHABLE_TOKEN_IN_PHRASE = "lmmtip";
+ //(2) TOKEN level
+ public static final String PARAM_LEXICAL_CATEGORIES = "lc";
+ public static final String PARAM_POS_TYPES = "pos";
+ public static final String PARAM_POS_TAG = "tag";
+ public static final String PARAM_POS_PROBABILITY = "prob";
+ /**
+ * Parameter used to configure how to deal with upper case tokens
+ */
+ public static final String PARAM_UPPER_CASE = "uc";
+ /**
+ * Enumeration defining valued for the {@link EntityLinkingEngine#PARAM_UPPER_CASE} parameter
+ */
+ public static enum UPPER_CASE_MODE {NONE,MATCH,LINK};
+ /**
+ * The default state to dereference entities set to <code>true</code>.
+ */
+ public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = true;
+ /**
+ * Default set of languages. This is an empty set indicating that texts in any
+ * language are processed.
+ */
+ public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet();
+ public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.6667;
+
+ /**
+ * The languages this engine is configured to enhance. An empty List is
+ * considered as active for any language
+ */
+ private LanguageConfiguration languages = new LanguageConfiguration(PROCESSED_LANGUAGES, new String[]{"*"});
+
+ private LanguageProcessingConfig defaultConfig;
+ private Map<String,LanguageProcessingConfig> languageConfigs = new HashMap<String,LanguageProcessingConfig>();
+
    /**
     * Creates a new text processing configuration with a default
     * {@link LanguageProcessingConfig} and no language specific configurations.
     */
    public TextProcessingConfig(){
        this.defaultConfig = new LanguageProcessingConfig();
    }
+
    /**
     * Getter for the default {@link LanguageProcessingConfig} - used for all
     * languages without a language specific configuration.
     * @return the default language processing configuration
     */
    public LanguageProcessingConfig getDefaults(){
        return defaultConfig;
    }
    /**
     * Getter for the language specific configuration.
     * @param language the language
     * @return the configuration specific to the parsed language or <code>null</code>
     * if none.
     */
    public LanguageProcessingConfig getLanguageSpecificConfig(String language){
        return languageConfigs.get(language);
    }
+ /**
+ * Creates a language specific configuration by copying the currently configured
+ * defaults.
+ * @param language the language
+ * @return the specific configuration
+ * @throws IllegalStateException if a language specific configuration for the
+ * parsed language already exists.
+ */
+ public LanguageProcessingConfig createLanguageSpecificConfig(String language){
+ if(languageConfigs.containsKey(language)){
+ throw new IllegalStateException("A specific configuration for the language '"
+ +language+ "' does already exist!");
+ }
+ LanguageProcessingConfig conf = defaultConfig.clone();
+ languageConfigs.put(language, conf);
+ return conf;
+ }
    /**
     * Removes the language specific configuration for the parsed language
     * @param language the language
     * @return the removed configuration or <code>null</code> if no language
     * specific configuration was present for the parsed language
     */
    public LanguageProcessingConfig removeLanguageSpecificConfig(String language){
        return languageConfigs.remove(language);
    }
+
+ /**
+ * The {@link LanguageProcessingConfig} for the parsed language
+ * or <code>null</code> if the language is not included in the
+ * configuration. This will return the {@link #getDefaults()} if
+ * the parsed language does not have a specific configuration.<p>
+ * To obtain just language specific configuration use
+ * {@link #getLanguageSpecificConfig(String)}
+ * @param language the language
+ * @return the configuration or <code>null</code> if the language is
+ * not configured to be processed.
+ */
+ public LanguageProcessingConfig getConfiguration(String language) {
+ if(languages.isLanguage(language)){
+ LanguageProcessingConfig lpc = languageConfigs.get(language);
+ return lpc == null ? defaultConfig : lpc;
+ } else {
+ return null;
+ }
+ }
+
+
+ /**
+ * Initialise the {@link TextAnalyzer} component.<p>
+ * Currently this includes the following configurations: <ul>
+ * <li>{@link #PROCESSED_LANGUAGES}: If no configuration is present the
+ * default (process all languages) is used.
+ * <li> {@value #MIN_POS_TAG_PROBABILITY}: If no configuration is
+ * present the #DEFAULT_MIN_POS_TAG_PROBABILITY is used
+ * languages based on the value of the
+ *
+ * @param configuration the OSGI component configuration
+ */
+ public final static TextProcessingConfig createInstance(Dictionary<String,Object> configuration) throws ConfigurationException {
+ TextProcessingConfig tpc = new TextProcessingConfig();
+ //Parse the default text processing configuration
+ //set the default LexicalTypes
+ Object value = configuration.get(PROCESS_ONLY_PROPER_NOUNS_STATE);
+ boolean properNounState;
+ if(value instanceof Boolean){
+ properNounState = ((Boolean)value).booleanValue();
+ } else if (value != null){
+ properNounState = Boolean.parseBoolean(value.toString());
+ } else {
+ properNounState = DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE;
+ }
+ if(properNounState){
+ tpc.defaultConfig.setLinkedLexicalCategories(Collections.EMPTY_SET);
+ tpc.defaultConfig.setLinkedPos(LanguageProcessingConfig.DEFAULT_LINKED_POS);
+ log.debug("> ProperNoun matching activated (matched Pos: {})",
+ tpc.defaultConfig.getLinkedPos());
+ } else {
+ tpc.defaultConfig.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
+ tpc.defaultConfig.setLinkedPos(Collections.EMPTY_SET);
+ log.debug("> Noun matching activated (matched LexicalCategories: {})",
+ tpc.defaultConfig.getLinkedLexicalCategories());
+ }
+ //parse the language configuration
+ value = configuration.get(PROCESSED_LANGUAGES);
+ if(value instanceof String){
+ throw new ConfigurationException(PROCESSED_LANGUAGES, "Comma separated String "
+ + "is not supported for configurung the processed languages for the because "
+ + "the comma is used as separator for values of the parameters '"
+ + PARAM_LEXICAL_CATEGORIES+"', '"+ PARAM_POS_TYPES+"'and'"+PARAM_POS_TAG
+ + "! Users need to use String[] or Collection<?> instead!");
+ }
+ tpc.languages.setConfiguration(configuration);
+ Map<String,String> defaultConfig = tpc.languages.getDefaultParameters();
+ //apply the default parameters (parameter set for the '*' or '' (empty) language
+ if(!defaultConfig.isEmpty()){
+ applyLanguageParameter(tpc.defaultConfig,null,defaultConfig);
+ }
+ //apply language specific configurations
+ for(String lang : tpc.languages.getExplicitlyIncluded()){
+ LanguageProcessingConfig lpc = tpc.defaultConfig.clone();
+ applyLanguageParameter(lpc, lang, tpc.languages.getParameters(lang));
+ tpc.languageConfigs.put(lang, lpc);
+ }
+ return tpc;
+ }
+
+ private static void applyLanguageParameter(LanguageProcessingConfig tpc, String language, Map<String,String> config) throws ConfigurationException {
+ log.info(" > parse language Configuration for language: {}",
+ language == null ? "default":language);
+ //parse Phrase level configuration
+ Set<LexicalCategory> chunkCats = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_PHRASE_CATEGORIES, LexicalCategory.class);
+ Set<String> chunkTags = parseStringTags(config.get(PARAM_PHRASE_TAG));
+ if(chunkCats.isEmpty() && config.containsKey(PARAM_PHRASE_CATEGORIES) &&
+ chunkTags.isEmpty()){
+ log.info(" + enable ignorePhrase");
+ tpc.setIgnoreChunksState(true);
+ tpc.setProcessedPhraseCategories(Collections.EMPTY_SET);
+ } else {
+ tpc.setIgnoreChunksState(false);
+ if(!chunkCats.isEmpty()){
+ log.info(" + set processable Phrase cat {}",chunkCats);
+ tpc.setProcessedPhraseCategories(chunkCats);
+ } else {
+ log.info(" - use processable Phrase cats {}",tpc.getProcessedPhraseCategories());
+ }
+ if(!chunkTags.isEmpty()) {
+ log.info(" + set processable Phrase tags {}",chunkTags);
+ tpc.setProcessedPhraseTags(chunkTags);
+ } else {
+ log.info(" - use processable Phrase tags {}",tpc.getProcessedPhraseTags());
+ }
+ }
+ Double chunkProb = parseNumber(config, PROCESSED_LANGUAGES, language, PARAM_PHRASE_PROBABILITY, Double.class);
+ if(chunkProb != null || //if explicitly set
+ config.containsKey(PARAM_PHRASE_PROBABILITY)){ //set to empty value (set default)
+ log.info(" + set min ChunkTag probability: {}", chunkProb == null ? "default" : chunkProb);
+ tpc.setMinPhraseAnnotationProbability(chunkProb);
+ tpc.setMinExcludePhraseAnnotationProbability(chunkProb == null ? null : chunkProb/2);
+ } else {
+ log.info(" - use min PhraseTag probability: {}",tpc.getMinPhraseAnnotationProbability());
+ }
+ //link multiple matchable Tokens within Chunks
+ Boolean lmmticState = parseState(config, PARAM_LINK_MULTI_MATCHABLE_TOKEN_IN_PHRASE);
+ if(lmmticState != null){
+ log.info(" + set the link multi matchable tokens in Phrase state to : {}",lmmticState);
+ tpc.setLinkMultiMatchableTokensInChunkState(lmmticState);
+ } else {
+ log.info(" - use the link multi matchable tokens in Phrase state to : {}",tpc.isLinkMultiMatchableTokensInChunk());
+ }
+
+ //parse Token level configuration
+ Set<LexicalCategory> lexCats = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_LEXICAL_CATEGORIES, LexicalCategory.class);
+ Set<Pos> pos = parseEnumParam(config, PROCESSED_LANGUAGES, language,PARAM_POS_TYPES, Pos.class);
+ Set<String> tags = parseStringTags(config.get(PARAM_POS_TAG));
+ if(config.containsKey(PARAM_LEXICAL_CATEGORIES) ||
+ config.containsKey(PARAM_POS_TYPES) ||
+ config.containsKey(PARAM_POS_TAG)){
+ log.info(" + set Linkable Tokens: cat: {}, pos: {}, tags {}",
+ new Object[]{lexCats,pos,tags});
+ tpc.setLinkedLexicalCategories(lexCats);
+ tpc.setLinkedPos(pos);
+ tpc.setLinkedPosTags(tags);
+ } else {
+ log.info(" - use Linkable Tokens: cat: {}, pos: {}, tags {}",
+ new Object[]{tpc.getLinkedLexicalCategories(),
+ tpc.getLinkedPos(),
+ tpc.getLinkedPos()});
+ }
+ //min POS tag probability
+ Double prob = parseNumber(config,PROCESSED_LANGUAGES,language, PARAM_POS_PROBABILITY,Double.class);
+ if(prob != null || //explicitly set
+ config.containsKey(PARAM_POS_PROBABILITY)){ //set to empty value (set default)
+ log.info(" + set minimum POS tag probability: {}", prob == null ? "default" : prob);
+ tpc.setMinPosAnnotationProbability(prob);
+ tpc.setMinExcludePosAnnotationProbability(prob == null ? null : prob/2d);
+ } else {
+ log.info(" - use minimum POS tag probability: {}", tpc.getMinPosAnnotationProbability());
+ }
+ //parse upper case
+ Set<UPPER_CASE_MODE> ucMode = parseEnumParam(config, PROCESSED_LANGUAGES,language,PARAM_UPPER_CASE,UPPER_CASE_MODE.class);
+ if(ucMode.size() > 1){
+ throw new ConfigurationException(PROCESSED_LANGUAGES, "Parameter 'uc' (Upper case mode) MUST NOT be multi valued (langauge: "
+ +(language == null ? "default":language)+", parsed value='"+config.get(PARAM_UPPER_CASE)+"')!");
+ }
+ if(!ucMode.isEmpty()){
+ UPPER_CASE_MODE mode = ucMode.iterator().next();
+ log.info(" + set upper case token mode to {}", mode);
+ switch (mode) {
+ case NONE:
+ tpc.setMatchUpperCaseTokensState(false);
+ tpc.setLinkUpperCaseTokensState(false);
+ break;
+ case MATCH:
+ tpc.setMatchUpperCaseTokensState(true);
+ tpc.setLinkUpperCaseTokensState(false);
+ break;
+ case LINK:
+ tpc.setMatchUpperCaseTokensState(true);
+ tpc.setLinkUpperCaseTokensState(true);
+ break;
+ default:
+ log.warn("Unsupported {} entry {} -> set defaults",UPPER_CASE_MODE.class.getSimpleName(),mode);
+ tpc.setMatchUpperCaseTokensState(null);
+ tpc.setLinkUpperCaseTokensState(null);
+ break;
+ }
+ } else {
+ log.info(" - use upper case token mode: match={}, link={}", tpc.isMatchUpperCaseTokens(), tpc.isLinkUpperCaseTokens());
+ }
+ }
+
+ private static Boolean parseState(Map<String,String> config, String param){
+ String value = config.get(param);
+ return value == null && config.containsKey(param) ? Boolean.TRUE :
+ value != null ? new Boolean(value) : null;
+ }
+
+ private static <T extends Number> T parseNumber(Map<String,String> config,
+ String property, String language, String param, Class<T> clazz) throws ConfigurationException {
+ String paramVal = config.get(PARAM_POS_PROBABILITY);
+ if(paramVal != null && !paramVal.trim().isEmpty()){
+ try {
+ //all Number subclasses do have a String constructor!
+ return clazz.getConstructor(String.class).newInstance(paramVal.trim());
+ } catch (NumberFormatException e) {
+ throw new ConfigurationException(property, "Unable to parse "
+ + clazz.getSimpleName()+" from Parameter '"
+ + PARAM_POS_PROBABILITY+"="+paramVal.trim()
+ + "' from the "+(language == null ? "default" : language)
+ + " language configuration", e);
+ } catch (IllegalArgumentException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (SecurityException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (InvocationTargetException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (NoSuchMethodException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ }
+ }
+ return null;
+ }
+
+ private static Set<String> parseStringTags(String value) {
+ if(value == null || value.isEmpty()){
+ return Collections.emptySet();
+ } else {
+ Set<String> tags = new HashSet<String>();
+ for(String entry : value.split(",")){
+ entry = entry.trim();
+ if(!entry.isEmpty()){
+ tags.add(entry);
+ }
+ }
+ return tags;
+ }
+ }
+
+ /**
+ * Utility to parse Enum members out of a comma separated string
+ * @param config the config
+ * @param property the property (only used for error handling)
+ * @param param the key of the config used to obtain the config
+ * @param enumClass the {@link Enum} class
+ * @return the configured members of the Enum or an empty set if none
+ * @throws ConfigurationException if a configured value was not part of the enum
+ */
+ private static <T extends Enum<T>> Set<T> parseEnumParam(Map<String,String> config,
+ String property, String language, //params used for logging
+ String param,Class<T> enumClass) throws ConfigurationException {
+ Set<T> enumSet;
+ String val = config.get(param);
+ if(val == null){
+ enumSet = Collections.emptySet();
+ } else {
+ enumSet = EnumSet.noneOf(enumClass);
+ for(String entry : val.split(",")){
+ entry = entry.trim();
+ if(!entry.isEmpty()){
+ try {
+ enumSet.add(Enum.valueOf(enumClass,entry.toString()));
+ } catch (IllegalArgumentException e) {
+ throw new ConfigurationException(property,
+ "'"+entry +"' of param '"+param+"' for language '"
+ + (language == null ? "default" : language)
+ + "'is not a member of the enum "+ enumClass.getSimpleName()
+ + "(configured : '"+val+"')!" ,e);
+ }
+ }
+ }
+ }
+ return enumSet;
+ }
+
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,332 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.entitylinking.engine;
+
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.lang.StringUtils;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
+import org.apache.felix.scr.annotations.ReferenceStrategy;
+import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
+import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/**
+ * Engine that consumes NLP processing results from the {@link AnalysedText}
+ * content part of processed {@link ContentItem}s and links them with
+ * Entities as provided by the configured {@link EntitySearcher} instance.
+ * @author Rupert Westenthaler
+ *
+ */
+public class EntityLinkingEngine implements EnhancementEngine, ServiceProperties {
+
+ private final Logger log = LoggerFactory.getLogger(EntityLinkingEngine.class);
+ /**
+ * This is used to check the content type of parsed {@link ContentItem}s for
+ * plain text
+ */
+ protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /**
+ * Contains the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+ */
+ protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+ /**
+ * The default value for the Execution of this Engine.
+ * This Engine creates TextAnnotations that should not be processed by other Engines.
+ * Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT}
+ * to ensure that other engines do not get confused
+ */
+ public static final Integer DEFAULT_ORDER = ServiceProperties.ORDERING_DEFAULT - 10;
+
+ /**
+ * The name of this engine
+ */
+ protected final String name;
+ /**
+ * The entitySearcher used for linking
+ */
+ protected final EntitySearcher entitySearcher;
+ /**
+ * configuration for entity linking
+ */
+ protected final EntityLinkerConfig linkerConfig;
+ /**
+ * The label tokenizer
+ */
+ protected final LabelTokenizer labelTokenizer;
+ /**
+ * The text processing configuration
+ */
+ protected final TextProcessingConfig textProcessingConfig;
+ /**
+ * The literalFactory used to create typed literals
+ */
+ private LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+ /**
+ * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced.
+ * For this engine that means it is necessary to check if the used {@link ReferencedSite} can operate
+ * offline or not.
+ *
+ * @see #enableOfflineMode(OfflineMode)
+ * @see #disableOfflineMode(OfflineMode)
+ */
+ @org.apache.felix.scr.annotations.Reference(
+ cardinality = ReferenceCardinality.OPTIONAL_UNARY,
+ policy = ReferencePolicy.DYNAMIC,
+ bind = "enableOfflineMode",
+ unbind = "disableOfflineMode",
+ strategy = ReferenceStrategy.EVENT)
+ private OfflineMode offlineMode;
+
+ /**
+ * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if the service becomes available
+ *
+ * @param mode
+ */
+ protected final void enableOfflineMode(OfflineMode mode) {
+ this.offlineMode = mode;
+ }
+
+ /**
+ * Called by the ConfigurationAdmin to unbind the {@link #offlineMode} if the service becomes unavailable
+ *
+ * @param mode
+ */
+ protected final void disableOfflineMode(OfflineMode mode) {
+ this.offlineMode = null;
+ }
+
+ /**
+ * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}.
+ *
+ * @return the offline state
+ */
+ protected final boolean isOfflineMode() {
+ return offlineMode != null;
+ }
+
+ /**
+ * Internal Constructor used by {@link #createInstance(EntitySearcher, LanguageProcessingConfig, EntityLinkerConfig)}
+ * @param entitySearcher The component used to lookup Entities
+ * @param textProcessingConfig The configuration on how to use the {@link AnalysedText} content part of
+ * processed {@link ContentItem}s
+ * @param linkingConfig the configuration for the EntityLinker
+ */
+ public EntityLinkingEngine(String name, EntitySearcher entitySearcher,TextProcessingConfig textProcessingConfig,
+ EntityLinkerConfig linkingConfig, LabelTokenizer labelTokenizer){
+ if(name == null || name.isEmpty()){
+ throw new IllegalArgumentException("The parsed EnhancementEngine name MUST NOT be NULL!");
+ }
+ this.name = name;
+ this.linkerConfig = linkingConfig != null ? linkingConfig : new EntityLinkerConfig();
+ this.textProcessingConfig = textProcessingConfig;
+ this.entitySearcher = entitySearcher;
+ this.labelTokenizer = labelTokenizer;
+ }
+
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(
+ ENHANCEMENT_ENGINE_ORDERING,
+ (Object) DEFAULT_ORDER));
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ log.info("canEnhancer {}",ci.getUri());
+ if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
+ log.warn("{} '{}' is inactive because EntitySearcher does not support Offline mode!",
+ getClass().getSimpleName(),getName());
+ return CANNOT_ENHANCE;
+ }
+ String language = getLanguage(this, ci, false);
+ if(language == null || textProcessingConfig.getConfiguration(language) == null){
+ log.debug("Engine {} ignores ContentItem {} becuase language {} is not condigured.",
+ new Object[]{ getName(), ci.getUri(), language});
+ return CANNOT_ENHANCE;
+ }
+ //we need a detected language, the AnalyzedText contentPart with
+ //Tokens.
+ AnalysedText at = getAnalysedText(this, ci, false);
+ return at != null && at.getTokens().hasNext() ?
+ ENHANCE_ASYNC : CANNOT_ENHANCE;
+ }
+
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ log.info(" enhance ci {}",ci.getUri());
+ if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
+ throw new EngineException(this,ci,"Offline mode is not supported by the used EntitySearcher!",null);
+ }
+ AnalysedText at = getAnalysedText(this, ci, true);
+ log.info(" > AnalysedText {}",at);
+ String language = getLanguage(this, ci, true);
+ if(log.isDebugEnabled()){
+ log.debug("computeEnhancements for ContentItem {} language {} text={}",
+ new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
+ }
+ log.debug(" > Language {}",language);
+ LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
+ if(languageConfig == null){
+ throw new IllegalStateException("The language '"+language+"' is not configured "
+ + "to be processed by this Engine. As this is already checked within the "
+ + "canEnhance(..) method this may indicate an bug in the used "
+ + "EnhanceemntJobManager implementation!");
+ }
+ EntityLinker entityLinker = new EntityLinker(at,language,
+ languageConfig, entitySearcher, linkerConfig, labelTokenizer);
+ //process
+ entityLinker.process();
+ //write results (requires a write lock)
+ ci.getLock().writeLock().lock();
+ try {
+ writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
+ }
+
+ /**
+ * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
+ * extracted from the parsed ContentItem
+ * @param ci
+ * @param linkedEntities
+ * @param language
+ */
+ private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
+ Language languageObject = null;
+ if(language != null && !language.isEmpty()){
+ languageObject = new Language(language);
+ }
+ Set<UriRef> dereferencedEntitis = new HashSet<UriRef>();
+ MGraph metadata = ci.getMetadata();
+ for(LinkedEntity linkedEntity : linkedEntities){
+ Collection<UriRef> textAnnotations = new ArrayList<UriRef>(linkedEntity.getOccurrences().size());
+ //first create the TextAnnotations for the Occurrences
+ for(Occurrence occurrence : linkedEntity.getOccurrences()){
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ textAnnotations.add(textAnnotation);
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_START,
+ literalFactory.createTypedLiteral(occurrence.getStart())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_END,
+ literalFactory.createTypedLiteral(occurrence.getEnd())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_SELECTION_CONTEXT,
+ new PlainLiteralImpl(occurrence.getContext(),languageObject)));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_SELECTED_TEXT,
+ new PlainLiteralImpl(occurrence.getSelectedText(),languageObject)));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_CONFIDENCE,
+ literalFactory.createTypedLiteral(linkedEntity.getScore())));
+ for(UriRef dcType : linkedEntity.getTypes()){
+ metadata.add(new TripleImpl(
+ textAnnotation, Properties.DC_TYPE, dcType));
+ }
+ }
+ //now the EntityAnnotations for the Suggestions
+ for(Suggestion suggestion : linkedEntity.getSuggestions()){
+ UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
+ //should we use the label used for the match, or search the
+ //representation for the best label ... currently its the matched one
+ Text label = suggestion.getBestLabel(linkerConfig.getNameField(),language);
+ Representation rep = suggestion.getRepresentation();
+ UriRef uri = new UriRef(rep.getId());
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_LABEL,
+ label.getLanguage() == null ?
+ new PlainLiteralImpl(label.getText()) :
+ new PlainLiteralImpl(label.getText(),
+ new Language(label.getLanguage()))));
+ metadata.add(new TripleImpl(entityAnnotation,ENHANCER_ENTITY_REFERENCE,uri));
+ Iterator<Reference> suggestionTypes = rep.getReferences(linkerConfig.getTypeField());
+ while(suggestionTypes.hasNext()){
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_TYPE, new UriRef(suggestionTypes.next().getReference())));
+ }
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
+ for(UriRef textAnnotation : textAnnotations){
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.DC_RELATION, textAnnotation));
+ }
+ //add origin information of the EntiySearcher
+ for(Entry<UriRef,Collection<Resource>> originInfo : entitySearcher.getOriginInformation().entrySet()){
+ for(Resource value : originInfo.getValue()){
+ metadata.add(new TripleImpl(entityAnnotation,
+ originInfo.getKey(),value));
+ }
+ }
+ //in case dereferencing of Entities is enabled we need also to
+ //add the RDF data for entities
+ if(linkerConfig.isDereferenceEntitiesEnabled() &&
+ dereferencedEntitis.add(uri)){ //not yet dereferenced
+ metadata.addAll(
+ RdfValueFactory.getInstance().toRdfRepresentation(
+ suggestion.getRepresentation()).getRdfGraph());
+ }
+ }
+ }
+ }
+
+}