You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/24 10:40:11 UTC
svn commit: r1413155 [2/4] - in
/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking: ./
src/ src/license/ src/main/ src/main/java/ src/main/java/org/
src/main/java/org/apache/ src/main/java/org/apache/stanbol/
src/main/java/org/apac...
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,516 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.config;
+
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+
+public class LanguageProcessingConfig implements Cloneable{
+
+ /**
+ * The linked Phrase types. Includes {@link LexicalCategory#Noun} phrases
+ */
+ public static final Set<LexicalCategory> DEFAULT_PROCESSED_PHRASE_CATEGORIES =
+ EnumSet.of(LexicalCategory.Noun);
+ /**
+ * The default set of {@link LexicalCategory LexicalCategories} used to
+ * lookup (link) Entities within the {@link EntitySearcher}
+ */
+ public static final Set<LexicalCategory> DEFAULT_LINKED_LEXICAL_CATEGORIES =
+ EnumSet.of(LexicalCategory.Noun, LexicalCategory.Residual);
+
+ /**
+ * The default set of {@link LexicalCategory LexicalCategories} used to
+ * match (and search) for Entities.<p>
+ * Matched Tokens are not used for linking, but are considered when matching
+ * label tokens of Entities with the Text.
+ */
+ public static final Set<LexicalCategory> DEFAULT_MATCHED_LEXICAL_CATEGORIES =
+ EnumSet.of(LexicalCategory.Noun, LexicalCategory.Quantifier,LexicalCategory.Residual);
+
+ /**
+ * The default set of {@link Pos} types that are used to lookup (link) Entities.
+ * By defualt only {@link Pos#ProperNoun}s and two
+ * {@link LexicalCategory#Residual} acronyms and
+ * words marked as foreign material.
+ */
+ public static final Set<Pos> DEFAULT_LINKED_POS =
+ EnumSet.of(Pos.ProperNoun, Pos.Foreign, Pos.Acronym);
+
+ /**
+ * Default value for POS annotation confidence required for processed POS tags.
+ * Used for <ul>
+ * <li> {@link #getLinkedLexicalCategories()}
+ * <li> {@link #getLinkedPosTags()} and
+ * <li> {@link #getMatchedLexicalCategories()}
+ * <ul>
+ */
+ public static final double DEFAULT_MIN_POS_ANNOTATION_PROBABILITY = 0.75;
+
+ /**
+ * Default value for POS annotation confidence required for not-processed POS tags
+ * (not contained in both {@link #getLinkedLexicalCategories()} and
+ * {@link #getLinkedPosTags()}). <br> The default is
+ * <code>{@link #DEFAULT_MIN_POS_ANNOTATION_PROBABILITY}/2</code>
+ */
+ public static final double DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY/2;
+
+ /**
+ * By default {@link Chunk}s are considered
+ */
+ public static final boolean DEFAULT_IGNORE_CHUNK_STATE = false;
+ /**
+ * the minimum probability so that a phrase in processed based on the Phrase Annotation
+ */
+ public static final double DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY = 0.75;
+ /**
+ * the minimum probability so that a phrase is rejected based on the Phrase Annotation
+ */
+ public static final double DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY =
+ DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY/2;
+ /**
+ * The default for linking upper case tokens (regardless of length and POS)
+ * The default is <code>false</code> as some languages (like German) use upper
+ * case for Nouns and so this would also affect configurations that only
+ * link {@link Pos#ProperNoun}s
+ */
+ public static final boolean DEFAULT_LINK_UPPER_CASE_TOKEN_STATE = false;
+ /**
+ * The default for matching upper case tokens (regardless of length and POS)
+ * is <code>true</code>
+ */
+ public static final boolean DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE = true;
+ /**
+ * By default linking of chunks with multiple matchable tokens is enabled.
+ * This is useful to link Entities represented by two common nouns.
+ */
+ public static final boolean DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE = true;
+
+ /**
+ * The set of {@link PosTag#getCategory()} considered for EntityLinking
+ * @see #DEFAULT_LINKED_LEXICAL_CATEGORIES
+ */
+ private Set<LexicalCategory> linkedLexicalCategories = DEFAULT_LINKED_LEXICAL_CATEGORIES;
+
+ private Set<LexicalCategory> matchedLexicalCategories = DEFAULT_MATCHED_LEXICAL_CATEGORIES;
+
+ /**
+ * The linked {@link Pos} categories
+ */
+ private Set<Pos> linkedPos = DEFAULT_LINKED_POS;
+ /**
+ * The set of {@link PosTag#getTag()} values that are processed
+ */
+ private Set<String> linkedPosTags = Collections.emptySet();
+ /**
+ * The minimum confidence of POS annotations for {@link #getLinkedLexicalCategories()}
+ * and {@link #getLinkedPosTags()}
+ */
+ private double minPosAnnotationProbability = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY;
+
+ /**
+ * The minimum confidence that a POS annotation
+ */
+ private double minExcludePosAnnotationProbability = DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY/2;
+
+ private boolean ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
+
+
+ private double minPhraseAnnotationProbability = DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY;
+
+ private double minExcludePhraseAnnotationProbability = DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY;
+
+ private Set<LexicalCategory> processedPhraseCategories = DEFAULT_PROCESSED_PHRASE_CATEGORIES;
+
+ private Set<String> processedPhraseTags = Collections.emptySet();
+ /**
+ * If upper case tokens are linked (and matched)
+ */
+ private boolean linkUpperCaseTokensState = DEFAULT_LINK_UPPER_CASE_TOKEN_STATE;
+ /**
+ * If upper case tokens are matched
+ */
+ private boolean matchUpperCaseTokensState = DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE;
+ /**
+ * If for {@link Chunk}s with multiple matchable Tokens those should be
+ * linked.
+ */
+ private boolean linkMultiMatchableTokensInChunkState = DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
+
+
+ /**
+ * The language or <code>null</code> for the default configuration
+ * @param language
+ */
+ public LanguageProcessingConfig(){
+ }
+
+ public final boolean isIgnoreChunks() {
+ return ignoreChunksState;
+ }
+
+ /**
+ * Setter for the ignore {@link Chunk} state.
+ * @param state the state or <code>null</code> to set the
+ * {@link #DEFAULT_IGNORE_CHUNK_STATE}
+ */
+ public final void setIgnoreChunksState(Boolean state){
+ if(state == null){
+ this.ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
+ } else {
+ this.ignoreChunksState = state;
+ }
+ }
+
+ /**
+ * Getter for the set of {@link LexicalCategory LexicalCategories} used
+ * to link Entities in the configured Vocabulary.
+ * @return the set of {@link LexicalCategory LexicalCategories} used
+ * for linking.
+ * @see #DEFAULT_LINKED_LEXICAL_CATEGORIES
+ */
+ public final Set<LexicalCategory> getLinkedLexicalCategories() {
+ return linkedLexicalCategories;
+ }
+ /**
+ * Getter for the set of {@link LexicalCategory LexicalCategories} used
+ * to match label tokens of suggested Entities.
+ * @return the set of {@link LexicalCategory LexicalCategories} used for
+ * matching
+ */
+ public final Set<LexicalCategory> getMatchedLexicalCategories(){
+ return matchedLexicalCategories;
+ }
+ /**
+ * Setter for the matched lexical categories
+ * @param matchedLexicalCategories the set or <code>null</code>
+ * to set the {@link #DEFAULT_MATCHED_LEXICAL_CATEGORIES}
+ */
+ public void setMatchedLexicalCategories(Set<LexicalCategory> matchedLexicalCategories) {
+ if(matchedLexicalCategories == null){
+ this.matchedLexicalCategories = DEFAULT_MATCHED_LEXICAL_CATEGORIES;
+ } else {
+ this.matchedLexicalCategories = EnumSet.noneOf(LexicalCategory.class);
+ this.matchedLexicalCategories.addAll(matchedLexicalCategories);
+ }
+ }
+ /**
+ * The set of tags used for linking. This is useful if the string tags
+ * used by the POS tagger are not mapped to {@link LexicalCategory} nor
+ * {@link Pos} enum members.
+ * @return the set of pos tags used for linking entities
+ */
+ public final Set<String> getLinkedPosTags() {
+ return linkedPosTags;
+ }
+
+ /**
+ * Getter for the minimum probability of POS annotations for
+ * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
+ * @return the probability
+ */
+ public final double getMinPosAnnotationProbability() {
+ return minPosAnnotationProbability ;
+ }
+
+
+ /**
+ * Getter for the minimum probability of POS annotations not included in
+ * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
+ * @return the probability
+ */
+ public final double getMinExcludePosAnnotationProbability() {
+ return minExcludePosAnnotationProbability;
+ }
+
+ /**
+ * Setter for the minimum probability of POS annotations for
+ * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
+ * @param minPosAnnotationProbability the probability or <code>null</code> to set
+ * {@value #DEFAULT_MIN_POS_ANNOTATION_PROBABILITY}
+ */
+ public final void setMinPosAnnotationProbability(Double minPosAnnotationProbability) {
+ if(minPosAnnotationProbability == null){
+ this.minPosAnnotationProbability = DEFAULT_MIN_POS_ANNOTATION_PROBABILITY;
+ } else if(minPosAnnotationProbability >= 0 && minPosAnnotationProbability <= 1) {
+ this.minPosAnnotationProbability = minPosAnnotationProbability;
+ } else {
+ throw new IllegalArgumentException("parsed value MUST BE in the range 0..1 or NULL to set the default");
+ }
+ }
+
+ /**
+ * Setter for the minimum probability of POS annotations not included in
+ * {@link #getLinkedLexicalCategories()} or {@link #getLinkedPosTags()}
+ * @param minExcludePosAnnotationProbability the probability or <code>null</code> to set
+ * {@value #DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY}
+ */
+ public final void setMinExcludePosAnnotationProbability(Double minExcludePosAnnotationProbability){
+ if(minExcludePosAnnotationProbability == null){
+ this.minExcludePosAnnotationProbability = DEFAULT_MIN_EXCLUDE_POS_ANNOTATION_PROBABILITY;
+ } else if(minExcludePosAnnotationProbability >= 0 && minExcludePosAnnotationProbability <= 1) {
+ this.minExcludePosAnnotationProbability = minExcludePosAnnotationProbability;
+ } else {
+ throw new IllegalArgumentException("parsed value MUST BE in the range 0..1 or NULL to set the default");
+ }
+ }
+ /**
+ * Setter for the linked {@link LexicalCategory LexicalCategories}
+ * @param linkedLexicalCategories the set or <code>null</code> to set
+ * the {@link #DEFAULT_LINKED_LEXICAL_CATEGORIES}.
+ */
+ public final void setLinkedLexicalCategories(Set<LexicalCategory> linkedLexicalCategories) {
+ if(linkedLexicalCategories == null){
+ this.linkedLexicalCategories = DEFAULT_LINKED_LEXICAL_CATEGORIES;
+ } else if(linkedLexicalCategories.contains(null)){
+ throw new IllegalArgumentException("The parsed set with linked LexicalCategories MUST NOT contain the NULL element!");
+ } else {
+ this.linkedLexicalCategories = linkedLexicalCategories;
+ }
+ }
+ /**
+ * Setter for the linked {@link Pos} types.
+ * @param linkedLexicalCategories the set of linked {@link Pos} types or <code>null</code>
+ * to set the {@link #DEFAULT_LINKED_POS} types
+ */
+ public final void setLinkedPos(Set<Pos> linkedPos) {
+ if(linkedPos == null){
+ this.linkedPos = DEFAULT_LINKED_POS;
+ } else if(linkedPos.contains(null)){
+ throw new IllegalArgumentException("The parsed set with linked LexicalCategories MUST NOT contain the NULL element!");
+ } else {
+ this.linkedPos = linkedPos;
+ }
+ }
+ /**
+ * Setter for the linked Pos Tags. This should only be used of the
+ * used POS tagger uses {@link PosTag}s that are not mapped to
+ * {@link LexicalCategory LexicalCategories} nor {@link Pos} types.
+ * @param processedPosTags the linked Pos tags. if <code>null</code>
+ * the value is set to an empty set.
+ */
+ public final void setLinkedPosTags(Set<String> processedPosTags) {
+ if(processedPosTags == null){
+ this.linkedPosTags = Collections.emptySet();
+ } else if(processedPosTags.contains(null)){
+ throw new IllegalArgumentException("The parsed set with processed POS tags MUST NOT contain the NULL element!");
+ } else {
+ this.linkedPosTags = processedPosTags;
+ }
+ }
+ /**
+ * Getter for the processed phrase categories.
+ * {@link Chunk}s of other types will be ignored.
+ * @return
+ */
+ public Set<LexicalCategory> getProcessedPhraseCategories() {
+ return processedPhraseCategories;
+ }
+ /**
+ * Setter for the processable phrase categories.
+ * @param processablePhraseCategories the processable categories or
+ * <code>null</code> to set the {@link #DEFAULT_PROCESSED_PHRASE_CATEGORIES}.
+ */
+ public void setProcessedPhraseCategories(Set<LexicalCategory> processablePhraseCategories){
+ if(processablePhraseCategories == null){
+ this.processedPhraseCategories = DEFAULT_PROCESSED_PHRASE_CATEGORIES;
+ } else {
+ this.processedPhraseCategories = EnumSet.noneOf(LexicalCategory.class);
+ this.processedPhraseCategories.addAll(processablePhraseCategories);
+ }
+ }
+ /**
+ * Getter for the prococessed phrase Tags. This should be only
+ * used if the {@link PhraseTag}s used by the Chunker are not
+ * mapped to {@link LexicalCategory LexicalCategories}.
+ * @return the processed phrase tags
+ */
+ public Set<String> getProcessedPhraseTags() {
+ return processedPhraseTags;
+ }
+ /**
+ * Setter for the Processed Phrase Tags
+ * @param processedPhraseTags the set with the tags. If <code>null</code>
+ * the value is set to an empty set.
+ */
+ public void setProcessedPhraseTags(Set<String> processedPhraseTags) {
+ if(processedPhraseTags == null || processedPhraseTags.isEmpty()){
+ this.processedPhraseTags = Collections.emptySet();
+ } else {
+ this.processedPhraseTags = new HashSet<String>(processedPhraseTags);
+ }
+ }
+ /**
+ * Getter for the minimum required probability so that {@link PhraseTag}s
+ * are accepted.
+ * @return the probability [0..1)
+ */
+ public double getMinPhraseAnnotationProbability() {
+ return minPhraseAnnotationProbability;
+ }
+ /**
+ * Getter for the minimum required probability so that {@link PhraseTag}s
+ * are considered for rejecting (e.g. to skip a VerbPhrase if
+ * {@link LexicalCategory#Verb} is not present in
+ * {@link #getProcessedPhraseCategories()}). Typically this value is
+ * lower as {@link #getMinPhraseAnnotationProbability()}
+ * @return the probability [0..1)
+ */
+ public double getMinExcludePhraseAnnotationProbability() {
+ return minExcludePhraseAnnotationProbability;
+ }
+ /**
+ * Setter for the minimum phrase annotation probability [0..1)
+ * @param prob the probability [0..1) or <code>null</code> to set
+ * the {@value #DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY}
+ * @throws IllegalArgumentException if the parsed value is not
+ * in the range [0..1).
+ */
+ public void setMinPhraseAnnotationProbability(Double prob) {
+ if(prob == null){
+ this.minPhraseAnnotationProbability = DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY;
+ } else if (prob >= 1 || prob < 0){
+ throw new IllegalArgumentException("The parsed minimum phrase annotation probability '"
+ + prob +" MUST be in the range [0..1)!");
+ } else {
+ this.minPhraseAnnotationProbability = prob;
+ }
+ }
+
+ /**
+ * Setter for the minimum excluded phrase annotation probability [0..1)
+ * @param prob the probability [0..1) or <code>null</code> to set
+ * the {@value #DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY}
+ * @throws IllegalArgumentException if the parsed value is not
+ * in the range [0..1).
+ */
+ public void setMinExcludePhraseAnnotationProbability(Double prob) {
+ if(prob == null){
+ this.minExcludePhraseAnnotationProbability = DEFAULT_MIN_EXCLUDE_PHRASE_ANNOTATION_PROBABILITY;
+ } else if (prob >= 1 || prob < 0){
+ throw new IllegalArgumentException("The parsed minimum exclude phrase annotation probability '"
+ + prob +" MUST be in the range [0..1)!");
+ } else {
+ this.minExcludePhraseAnnotationProbability = prob;
+ }
+ }
+ /**
+ * Getter for the set of {@link Pos} types used for linking Entities
+ * @return the linked {@link Pos} types
+ */
+ public Set<Pos> getLinkedPos() {
+ return linkedPos;
+ }
+
+ /**
+ * If upper case Tokens should be linked regardless
+ * of the POS type and length
+ * @return
+ */
+ public boolean isLinkUpperCaseTokens(){
+ return linkUpperCaseTokensState;
+ }
+ /**
+ * Setter for the state if upper case token should be
+ * linked regardless of the POS type and length
+ * @param linkUpperCaseTokensState the state or <code>null</code>
+ * to set the {@link #DEFAULT_LINK_UPPER_CASE_TOKEN_STATE}
+ */
+ public void setLinkUpperCaseTokensState(Boolean linkUpperCaseTokensState) {
+ if(linkUpperCaseTokensState == null){
+ this.linkUpperCaseTokensState = DEFAULT_LINK_UPPER_CASE_TOKEN_STATE;
+ } else {
+ this.linkUpperCaseTokensState = linkUpperCaseTokensState;
+ }
+ }
+ /**
+ * If upper case Tokens should be matched regardless
+ * of the POS type and length
+ * @return
+ */
+ public boolean isMatchUpperCaseTokens(){
+ return matchUpperCaseTokensState;
+ }
+ /**
+ * Setter for the state if upper case token should be
+ * matched regardless of the POS type and length
+ * @param matchUpperCaseTokensState the state or <code>null</code>
+ * to set the {@link #DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE}
+ */
+ public void setMatchUpperCaseTokensState(Boolean matchUpperCaseTokensState) {
+ if(matchUpperCaseTokensState == null){
+ this.matchUpperCaseTokensState = DEFAULT_MATCH_UPPER_CASE_TOKEN_STATE;
+ } else {
+ this.matchUpperCaseTokensState = matchUpperCaseTokensState;
+ }
+ }
+ /**
+ * If {@link #isIgnoreChunks()} is disabled than this allows
+ * to convert matchable {@link Token}s to linked one in
+ * case a {@link Chunk} contains more than one matchable
+ * Token. <p>
+ * This is especially useful in cases where only
+ * {@link Pos#ProperNoun}s are processed to also detect
+ * Entities that are named by using multiple Common Nouns.
+ * In cases where all {@link LexicalCategory#Noun}s are
+ * processed this option has usually no influence on the
+ * results.
+ * @return the state
+ */
+ public boolean isLinkMultiMatchableTokensInChunk() {
+ return linkMultiMatchableTokensInChunkState;
+ }
+ /**
+ * Setter for state if for {@link Chunk}s with multiple
+ * matchable {@link Token}s those Tokens should be treated
+ * as linkable.<p>
+ * This is especially useful in cases where only
+ * {@link Pos#ProperNoun}s are linked to also detect
+ * Entities that are named by using multiple Common Nouns.
+ * In cases where all {@link LexicalCategory#Noun}s are
+ * processed this option has usually no influence on the
+ * results.
+ * @param state the state or <code>null</code> to reset to the
+ * the {@link #DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE default}
+ */
+ public void setLinkMultiMatchableTokensInChunkState(Boolean state){
+ if(state == null){
+ this.linkMultiMatchableTokensInChunkState = DEFAULT_LINK_MULTIPLE_MATCHABLE_TOKENS_IN_CHUNKS_STATE;
+ } else {
+ this.linkMultiMatchableTokensInChunkState = state;
+ }
+ }
+ /**
+ * Clones the {@link LanguageProcessingConfig}. Intended to be used
+ * to create language specific configs based on the default one.
+ */
+ @Override
+ public LanguageProcessingConfig clone() {
+ LanguageProcessingConfig c = new LanguageProcessingConfig();
+ c.ignoreChunksState = ignoreChunksState;
+ c.minExcludePhraseAnnotationProbability = minExcludePhraseAnnotationProbability;
+ c.minExcludePosAnnotationProbability = minExcludePosAnnotationProbability;
+ c.minPhraseAnnotationProbability = minPhraseAnnotationProbability;
+ c.minPosAnnotationProbability = minPosAnnotationProbability;
+ c.linkedLexicalCategories = linkedLexicalCategories;
+ c.processedPhraseCategories = processedPhraseCategories;
+ c.processedPhraseTags = processedPhraseTags;
+ c.linkedPos = linkedPos;
+ c.linkedPosTags = linkedPosTags;
+ c.linkUpperCaseTokensState = linkUpperCaseTokensState;
+ c.matchUpperCaseTokensState = matchUpperCaseTokensState;
+ c.linkMultiMatchableTokensInChunkState = linkMultiMatchableTokensInChunkState;
+ c.matchedLexicalCategories = matchedLexicalCategories;
+ return c;
+ }
+
+
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,415 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.config;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.osgi.service.cm.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TextProcessingConfig {
+
+ private static final Logger log = LoggerFactory.getLogger(TextProcessingConfig.class);
+ /**
+ * If enabled only {@link Pos#ProperNoun}, {@link Pos#Foreign} and {@link Pos#Acronym} are Matched. If
+ * deactivated all Tokens with the category {@link LexicalCategory#Noun} and
+ * {@link LexicalCategory#Residual} are considered for matching.<p>
+ * This property allows an easy configuration of the matching that is sufficient for most usage scenarios.
+ * Users that need to have more control can configure language specific mappings by using
+ * {@link #PARAM_LEXICAL_CATEGORIES}, {@link #PARAM_POS_TYPES}, {@link #PARAM_POS_TAG} and
+ * {@link #PARAM_POS_PROBABILITY} in combination with the {@link #PROCESSED_LANGUAGES}
+ * configuration.<p>
+ * The {@link #DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE default} if this is <code>false</code>
+ */
+ public static final String PROCESS_ONLY_PROPER_NOUNS_STATE = "enhancer.engines.linking.properNounsState";
+ /**
+ * Default for the {@link #PROCESS_ONLY_PROPER_NOUNS_STATE} (false)
+ */
+ public static final boolean DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE = false;
+ /**
+ * Allows to configure the processed languages by using the syntax supported by {@link LanguageConfiguration}.
+ * In addition this engine supports language specific configurations for matched {@link LexicalCategory}
+ * {@link Pos} and String POS tags as well as Pos annotation probabilities by using the parameters
+ * {@link #PARAM_LEXICAL_CATEGORIES}, {@link #PARAM_POS_TYPES}, {@link #PARAM_POS_TAG} and
+ * {@link #PARAM_POS_PROBABILITY}.<p>
+ * See the documentation of {@link LanguageConfiguration} for details of the Syntax.
+ */
+ public static final String PROCESSED_LANGUAGES = "enhancer.engines.linking.processedLanguages";
+ /*
+ * Parameters used for language specific text processing configurations
+ */
+ // (1) PHRASE level
+ /**
+ * Allows to configure the processed Chunk type (the default is
+ * <code>cc={@link LexicalCategory#Noun Noun}</code> to process only
+ * Noun Phrases). If set to <code>cc</code> (empty value) processing
+ * of chunks is deactivated.
+ */
+ public static final String PARAM_PHRASE_CATEGORIES = "pc";
+ public static final String PARAM_PHRASE_TAG = "ptag";
+ public static final String PARAM_PHRASE_PROBABILITY = "pprob";
+ public static final String PARAM_LINK_MULTI_MATCHABLE_TOKEN_IN_PHRASE = "lmmtip";
+ //(2) TOKEN level
+ public static final String PARAM_LEXICAL_CATEGORIES = "lc";
+ public static final String PARAM_POS_TYPES = "pos";
+ public static final String PARAM_POS_TAG = "tag";
+ public static final String PARAM_POS_PROBABILITY = "prob";
+ /**
+ * Parameter used to configure how to deal with upper case tokens
+ */
+ public static final String PARAM_UPPER_CASE = "uc";
+ /**
+ * Enumeration defining valued for the {@link EntityLinkingEngine#PARAM_UPPER_CASE} parameter
+ */
+ public static enum UPPER_CASE_MODE {NONE,MATCH,LINK};
+ /**
+ * The default state to dereference entities set to <code>true</code>.
+ */
+ public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = true;
+ /**
+ * Default set of languages. This is an empty set indicating that texts in any
+ * language are processed.
+ */
+ public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet();
+ public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.6667;
+
+ /**
+ * The languages this engine is configured to enhance. An empty List is
+ * considered as active for any language
+ */
+ private LanguageConfiguration languages = new LanguageConfiguration(PROCESSED_LANGUAGES, new String[]{"*"});
+
+ private LanguageProcessingConfig defaultConfig;
+ private Map<String,LanguageProcessingConfig> languageConfigs = new HashMap<String,LanguageProcessingConfig>();
+
    /**
     * Creates a new text processing configuration with a default
     * {@link LanguageProcessingConfig} and no language specific configurations.
     */
    public TextProcessingConfig(){
        this.defaultConfig = new LanguageProcessingConfig();
    }
+
    /**
     * Getter for the default {@link LanguageProcessingConfig} - used for all
     * languages without a language specific configuration.
     * @return the default language processing configuration
     */
    public LanguageProcessingConfig getDefaults(){
        return defaultConfig;
    }
    /**
     * Getter for the language specific configuration.
     * @param language the language
     * @return the configuration specific to the parsed language or <code>null</code>
     * if none.
     */
    public LanguageProcessingConfig getLanguageSpecificConfig(String language){
        return languageConfigs.get(language);
    }
+ /**
+ * Creates a language specific configuration by copying the currently configured
+ * defaults.
+ * @param language the language
+ * @return the specific configuration
+ * @throws IllegalStateException if a language specific configuration for the
+ * parsed language already exists.
+ */
+ public LanguageProcessingConfig createLanguageSpecificConfig(String language){
+ if(languageConfigs.containsKey(language)){
+ throw new IllegalStateException("A specific configuration for the language '"
+ +language+ "' does already exist!");
+ }
+ LanguageProcessingConfig conf = defaultConfig.clone();
+ languageConfigs.put(language, conf);
+ return conf;
+ }
    /**
     * Removes the language specific configuration for the parsed language
     * @param language the language
     * @return the removed configuration or <code>null</code> if no language
     * specific configuration was present for the parsed language
     */
    public LanguageProcessingConfig removeLanguageSpecificConfig(String language){
        return languageConfigs.remove(language);
    }
+
+ /**
+ * The {@link LanguageProcessingConfig} for the parsed language
+ * or <code>null</code> if the language is not included in the
+ * configuration. This will return the {@link #getDefaults()} if
+ * the parsed language does not have a specific configuration.<p>
+ * To obtain just language specific configuration use
+ * {@link #getLanguageSpecificConfig(String)}
+ * @param language the language
+ * @return the configuration or <code>null</code> if the language is
+ * not configured to be processed.
+ */
+ public LanguageProcessingConfig getConfiguration(String language) {
+ if(languages.isLanguage(language)){
+ LanguageProcessingConfig lpc = languageConfigs.get(language);
+ return lpc == null ? defaultConfig : lpc;
+ } else {
+ return null;
+ }
+ }
+
+
+ /**
+ * Initialise the {@link TextAnalyzer} component.<p>
+ * Currently this includes the following configurations: <ul>
+ * <li>{@link #PROCESSED_LANGUAGES}: If no configuration is present the
+ * default (process all languages) is used.
+ * <li> {@value #MIN_POS_TAG_PROBABILITY}: If no configuration is
+ * present the #DEFAULT_MIN_POS_TAG_PROBABILITY is used
+ * languages based on the value of the
+ *
+ * @param configuration the OSGI component configuration
+ */
+ public final static TextProcessingConfig createInstance(Dictionary<String,Object> configuration) throws ConfigurationException {
+ TextProcessingConfig tpc = new TextProcessingConfig();
+ //Parse the default text processing configuration
+ //set the default LexicalTypes
+ Object value = configuration.get(PROCESS_ONLY_PROPER_NOUNS_STATE);
+ boolean properNounState;
+ if(value instanceof Boolean){
+ properNounState = ((Boolean)value).booleanValue();
+ } else if (value != null){
+ properNounState = Boolean.parseBoolean(value.toString());
+ } else {
+ properNounState = DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE;
+ }
+ if(properNounState){
+ tpc.defaultConfig.setLinkedLexicalCategories(Collections.EMPTY_SET);
+ tpc.defaultConfig.setLinkedPos(LanguageProcessingConfig.DEFAULT_LINKED_POS);
+ log.debug("> ProperNoun matching activated (matched Pos: {})",
+ tpc.defaultConfig.getLinkedPos());
+ } else {
+ tpc.defaultConfig.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES);
+ tpc.defaultConfig.setLinkedPos(Collections.EMPTY_SET);
+ log.debug("> Noun matching activated (matched LexicalCategories: {})",
+ tpc.defaultConfig.getLinkedLexicalCategories());
+ }
+ //parse the language configuration
+ value = configuration.get(PROCESSED_LANGUAGES);
+ if(value instanceof String){
+ throw new ConfigurationException(PROCESSED_LANGUAGES, "Comma separated String "
+ + "is not supported for configurung the processed languages for the because "
+ + "the comma is used as separator for values of the parameters '"
+ + PARAM_LEXICAL_CATEGORIES+"', '"+ PARAM_POS_TYPES+"'and'"+PARAM_POS_TAG
+ + "! Users need to use String[] or Collection<?> instead!");
+ }
+ tpc.languages.setConfiguration(configuration);
+ Map<String,String> defaultConfig = tpc.languages.getDefaultParameters();
+ //apply the default parameters (parameter set for the '*' or '' (empty) language
+ if(!defaultConfig.isEmpty()){
+ applyLanguageParameter(tpc.defaultConfig,null,defaultConfig);
+ }
+ //apply language specific configurations
+ for(String lang : tpc.languages.getExplicitlyIncluded()){
+ LanguageProcessingConfig lpc = tpc.defaultConfig.clone();
+ applyLanguageParameter(lpc, lang, tpc.languages.getParameters(lang));
+ tpc.languageConfigs.put(lang, lpc);
+ }
+ return tpc;
+ }
+
+ private static void applyLanguageParameter(LanguageProcessingConfig tpc, String language, Map<String,String> config) throws ConfigurationException {
+ log.info(" > parse language Configuration for language: {}",
+ language == null ? "default":language);
+ //parse Phrase level configuration
+ Set<LexicalCategory> chunkCats = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_PHRASE_CATEGORIES, LexicalCategory.class);
+ Set<String> chunkTags = parseStringTags(config.get(PARAM_PHRASE_TAG));
+ if(chunkCats.isEmpty() && config.containsKey(PARAM_PHRASE_CATEGORIES) &&
+ chunkTags.isEmpty()){
+ log.info(" + enable ignorePhrase");
+ tpc.setIgnoreChunksState(true);
+ tpc.setProcessedPhraseCategories(Collections.EMPTY_SET);
+ } else {
+ tpc.setIgnoreChunksState(false);
+ if(!chunkCats.isEmpty()){
+ log.info(" + set processable Phrase cat {}",chunkCats);
+ tpc.setProcessedPhraseCategories(chunkCats);
+ } else {
+ log.info(" - use processable Phrase cats {}",tpc.getProcessedPhraseCategories());
+ }
+ if(!chunkTags.isEmpty()) {
+ log.info(" + set processable Phrase tags {}",chunkTags);
+ tpc.setProcessedPhraseTags(chunkTags);
+ } else {
+ log.info(" - use processable Phrase tags {}",tpc.getProcessedPhraseTags());
+ }
+ }
+ Double chunkProb = parseNumber(config, PROCESSED_LANGUAGES, language, PARAM_PHRASE_PROBABILITY, Double.class);
+ if(chunkProb != null || //if explicitly set
+ config.containsKey(PARAM_PHRASE_PROBABILITY)){ //set to empty value (set default)
+ log.info(" + set min ChunkTag probability: {}", chunkProb == null ? "default" : chunkProb);
+ tpc.setMinPhraseAnnotationProbability(chunkProb);
+ tpc.setMinExcludePhraseAnnotationProbability(chunkProb == null ? null : chunkProb/2);
+ } else {
+ log.info(" - use min PhraseTag probability: {}",tpc.getMinPhraseAnnotationProbability());
+ }
+ //link multiple matchable Tokens within Chunks
+ Boolean lmmticState = parseState(config, PARAM_LINK_MULTI_MATCHABLE_TOKEN_IN_PHRASE);
+ if(lmmticState != null){
+ log.info(" + set the link multi matchable tokens in Phrase state to : {}",lmmticState);
+ tpc.setLinkMultiMatchableTokensInChunkState(lmmticState);
+ } else {
+ log.info(" - use the link multi matchable tokens in Phrase state to : {}",tpc.isLinkMultiMatchableTokensInChunk());
+ }
+
+ //parse Token level configuration
+ Set<LexicalCategory> lexCats = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_LEXICAL_CATEGORIES, LexicalCategory.class);
+ Set<Pos> pos = parseEnumParam(config, PROCESSED_LANGUAGES, language,PARAM_POS_TYPES, Pos.class);
+ Set<String> tags = parseStringTags(config.get(PARAM_POS_TAG));
+ if(config.containsKey(PARAM_LEXICAL_CATEGORIES) ||
+ config.containsKey(PARAM_POS_TYPES) ||
+ config.containsKey(PARAM_POS_TAG)){
+ log.info(" + set Linkable Tokens: cat: {}, pos: {}, tags {}",
+ new Object[]{lexCats,pos,tags});
+ tpc.setLinkedLexicalCategories(lexCats);
+ tpc.setLinkedPos(pos);
+ tpc.setLinkedPosTags(tags);
+ } else {
+ log.info(" - use Linkable Tokens: cat: {}, pos: {}, tags {}",
+ new Object[]{tpc.getLinkedLexicalCategories(),
+ tpc.getLinkedPos(),
+ tpc.getLinkedPos()});
+ }
+ //min POS tag probability
+ Double prob = parseNumber(config,PROCESSED_LANGUAGES,language, PARAM_POS_PROBABILITY,Double.class);
+ if(prob != null || //explicitly set
+ config.containsKey(PARAM_POS_PROBABILITY)){ //set to empty value (set default)
+ log.info(" + set minimum POS tag probability: {}", prob == null ? "default" : prob);
+ tpc.setMinPosAnnotationProbability(prob);
+ tpc.setMinExcludePosAnnotationProbability(prob == null ? null : prob/2d);
+ } else {
+ log.info(" - use minimum POS tag probability: {}", tpc.getMinPosAnnotationProbability());
+ }
+ //parse upper case
+ Set<UPPER_CASE_MODE> ucMode = parseEnumParam(config, PROCESSED_LANGUAGES,language,PARAM_UPPER_CASE,UPPER_CASE_MODE.class);
+ if(ucMode.size() > 1){
+ throw new ConfigurationException(PROCESSED_LANGUAGES, "Parameter 'uc' (Upper case mode) MUST NOT be multi valued (langauge: "
+ +(language == null ? "default":language)+", parsed value='"+config.get(PARAM_UPPER_CASE)+"')!");
+ }
+ if(!ucMode.isEmpty()){
+ UPPER_CASE_MODE mode = ucMode.iterator().next();
+ log.info(" + set upper case token mode to {}", mode);
+ switch (mode) {
+ case NONE:
+ tpc.setMatchUpperCaseTokensState(false);
+ tpc.setLinkUpperCaseTokensState(false);
+ break;
+ case MATCH:
+ tpc.setMatchUpperCaseTokensState(true);
+ tpc.setLinkUpperCaseTokensState(false);
+ break;
+ case LINK:
+ tpc.setMatchUpperCaseTokensState(true);
+ tpc.setLinkUpperCaseTokensState(true);
+ break;
+ default:
+ log.warn("Unsupported {} entry {} -> set defaults",UPPER_CASE_MODE.class.getSimpleName(),mode);
+ tpc.setMatchUpperCaseTokensState(null);
+ tpc.setLinkUpperCaseTokensState(null);
+ break;
+ }
+ } else {
+ log.info(" - use upper case token mode: match={}, link={}", tpc.isMatchUpperCaseTokens(), tpc.isLinkUpperCaseTokens());
+ }
+ }
+
+ private static Boolean parseState(Map<String,String> config, String param){
+ String value = config.get(param);
+ return value == null && config.containsKey(param) ? Boolean.TRUE :
+ value != null ? new Boolean(value) : null;
+ }
+
+ private static <T extends Number> T parseNumber(Map<String,String> config,
+ String property, String language, String param, Class<T> clazz) throws ConfigurationException {
+ String paramVal = config.get(PARAM_POS_PROBABILITY);
+ if(paramVal != null && !paramVal.trim().isEmpty()){
+ try {
+ //all Number subclasses do have a String constructor!
+ return clazz.getConstructor(String.class).newInstance(paramVal.trim());
+ } catch (NumberFormatException e) {
+ throw new ConfigurationException(property, "Unable to parse "
+ + clazz.getSimpleName()+" from Parameter '"
+ + PARAM_POS_PROBABILITY+"="+paramVal.trim()
+ + "' from the "+(language == null ? "default" : language)
+ + " language configuration", e);
+ } catch (IllegalArgumentException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (SecurityException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (InvocationTargetException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ } catch (NoSuchMethodException e) {
+ throw new IllegalStateException("Unable to create new "+clazz.getSimpleName()
+ +"("+paramVal.trim()+"::String)",e);
+ }
+ }
+ return null;
+ }
+
+ private static Set<String> parseStringTags(String value) {
+ if(value == null || value.isEmpty()){
+ return Collections.emptySet();
+ } else {
+ Set<String> tags = new HashSet<String>();
+ for(String entry : value.split(",")){
+ entry = entry.trim();
+ if(!entry.isEmpty()){
+ tags.add(entry);
+ }
+ }
+ return tags;
+ }
+ }
+
+ /**
+ * Utility to parse Enum members out of a comma separated string
+ * @param config the config
+ * @param property the property (only used for error handling)
+ * @param param the key of the config used to obtain the config
+ * @param enumClass the {@link Enum} class
+ * @return the configured members of the Enum or an empty set if none
+ * @throws ConfigurationException if a configured value was not part of the enum
+ */
+ private static <T extends Enum<T>> Set<T> parseEnumParam(Map<String,String> config,
+ String property, String language, //params used for logging
+ String param,Class<T> enumClass) throws ConfigurationException {
+ Set<T> enumSet;
+ String val = config.get(param);
+ if(val == null){
+ enumSet = Collections.emptySet();
+ } else {
+ enumSet = EnumSet.noneOf(enumClass);
+ for(String entry : val.split(",")){
+ entry = entry.trim();
+ if(!entry.isEmpty()){
+ try {
+ enumSet.add(Enum.valueOf(enumClass,entry.toString()));
+ } catch (IllegalArgumentException e) {
+ throw new ConfigurationException(property,
+ "'"+entry +"' of param '"+param+"' for language '"
+ + (language == null ? "default" : language)
+ + "'is not a member of the enum "+ enumClass.getSimpleName()
+ + "(configured : '"+val+"')!" ,e);
+ }
+ }
+ }
+ }
+ return enumSet;
+ }
+
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,332 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.entitylinking.engine;
+
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.lang.StringUtils;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
+import org.apache.felix.scr.annotations.ReferenceStrategy;
+import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
+import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/**
+ * Engine that consumes NLP processing results from the {@link AnalysedText}
+ * content part of processed {@link ContentItem}s and links them with
+ * Entities as provided by the configured {@link EntitySearcher} instance.
+ * @author Rupert Westenthaler
+ *
+ */
+public class EntityLinkingEngine implements EnhancementEngine, ServiceProperties {
+
+ private final Logger log = LoggerFactory.getLogger(EntityLinkingEngine.class);
+ /**
+ * This is used to check the content type of parsed {@link ContentItem}s for
+ * plain text
+ */
+ protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /**
+ * Contains the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+ */
+ protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+ /**
+ * The default value for the Execution of this Engine.
+ * This Engine creates TextAnnotations that should not be processed by other Engines.
+ * Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT}
+ * to ensure that other engines do not get confused
+ */
+ public static final Integer DEFAULT_ORDER = ServiceProperties.ORDERING_DEFAULT - 10;
+
+ /**
+ * The name of this engine
+ */
+ protected final String name;
+ /**
+ * The entitySearcher used for linking
+ */
+ protected final EntitySearcher entitySearcher;
+ /**
+ * configuration for entity linking
+ */
+ protected final EntityLinkerConfig linkerConfig;
+ /**
+ * The label tokenizer
+ */
+ protected final LabelTokenizer labelTokenizer;
+ /**
+ * The text processing configuration
+ */
+ protected final TextProcessingConfig textProcessingConfig;
+ /**
+ * The literalFactory used to create typed literals
+ */
+ private LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+ /**
+ * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced.
+ * For this engine that means it is necessary to check if the used {@link ReferencedSite} can operate
+ * offline or not.
+ *
+ * @see #enableOfflineMode(OfflineMode)
+ * @see #disableOfflineMode(OfflineMode)
+ */
+ @org.apache.felix.scr.annotations.Reference(
+ cardinality = ReferenceCardinality.OPTIONAL_UNARY,
+ policy = ReferencePolicy.DYNAMIC,
+ bind = "enableOfflineMode",
+ unbind = "disableOfflineMode",
+ strategy = ReferenceStrategy.EVENT)
+ private OfflineMode offlineMode;
+
+ /**
+ * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if the service becomes available
+ *
+ * @param mode
+ */
+ protected final void enableOfflineMode(OfflineMode mode) {
+ this.offlineMode = mode;
+ }
+
+ /**
+ * Called by the ConfigurationAdmin to unbind the {@link #offlineMode} if the service becomes unavailable
+ *
+ * @param mode
+ */
+ protected final void disableOfflineMode(OfflineMode mode) {
+ this.offlineMode = null;
+ }
+
+ /**
+ * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}.
+ *
+ * @return the offline state
+ */
+ protected final boolean isOfflineMode() {
+ return offlineMode != null;
+ }
+
+ /**
+ * Internal Constructor used by {@link #createInstance(EntitySearcher, LanguageProcessingConfig, EntityLinkerConfig)}
+ * @param entitySearcher The component used to lookup Entities
+ * @param textProcessingConfig The configuration on how to use the {@link AnalysedText} content part of
+ * processed {@link ContentItem}s
+ * @param linkingConfig the configuration for the EntityLinker
+ */
+ public EntityLinkingEngine(String name, EntitySearcher entitySearcher,TextProcessingConfig textProcessingConfig,
+ EntityLinkerConfig linkingConfig, LabelTokenizer labelTokenizer){
+ if(name == null || name.isEmpty()){
+ throw new IllegalArgumentException("The parsed EnhancementEngine name MUST NOT be NULL!");
+ }
+ this.name = name;
+ this.linkerConfig = linkingConfig != null ? linkingConfig : new EntityLinkerConfig();
+ this.textProcessingConfig = textProcessingConfig;
+ this.entitySearcher = entitySearcher;
+ this.labelTokenizer = labelTokenizer;
+ }
+
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(
+ ENHANCEMENT_ENGINE_ORDERING,
+ (Object) DEFAULT_ORDER));
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ log.info("canEnhancer {}",ci.getUri());
+ if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
+ log.warn("{} '{}' is inactive because EntitySearcher does not support Offline mode!",
+ getClass().getSimpleName(),getName());
+ return CANNOT_ENHANCE;
+ }
+ String language = getLanguage(this, ci, false);
+ if(language == null || textProcessingConfig.getConfiguration(language) == null){
+ log.debug("Engine {} ignores ContentItem {} becuase language {} is not condigured.",
+ new Object[]{ getName(), ci.getUri(), language});
+ return CANNOT_ENHANCE;
+ }
+ //we need a detected language, the AnalyzedText contentPart with
+ //Tokens.
+ AnalysedText at = getAnalysedText(this, ci, false);
+ return at != null && at.getTokens().hasNext() ?
+ ENHANCE_ASYNC : CANNOT_ENHANCE;
+ }
+
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ log.info(" enhance ci {}",ci.getUri());
+ if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
+ throw new EngineException(this,ci,"Offline mode is not supported by the used EntitySearcher!",null);
+ }
+ AnalysedText at = getAnalysedText(this, ci, true);
+ log.info(" > AnalysedText {}",at);
+ String language = getLanguage(this, ci, true);
+ if(log.isDebugEnabled()){
+ log.debug("computeEnhancements for ContentItem {} language {} text={}",
+ new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
+ }
+ log.debug(" > Language {}",language);
+ LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
+ if(languageConfig == null){
+ throw new IllegalStateException("The language '"+language+"' is not configured "
+ + "to be processed by this Engine. As this is already checked within the "
+ + "canEnhance(..) method this may indicate an bug in the used "
+ + "EnhanceemntJobManager implementation!");
+ }
+ EntityLinker entityLinker = new EntityLinker(at,language,
+ languageConfig, entitySearcher, linkerConfig, labelTokenizer);
+ //process
+ entityLinker.process();
+ //write results (requires a write lock)
+ ci.getLock().writeLock().lock();
+ try {
+ writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
+ }
+
+ /**
+ * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
+ * extracted from the parsed ContentItem
+ * @param ci
+ * @param linkedEntities
+ * @param language
+ */
+ private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
+ Language languageObject = null;
+ if(language != null && !language.isEmpty()){
+ languageObject = new Language(language);
+ }
+ Set<UriRef> dereferencedEntitis = new HashSet<UriRef>();
+ MGraph metadata = ci.getMetadata();
+ for(LinkedEntity linkedEntity : linkedEntities){
+ Collection<UriRef> textAnnotations = new ArrayList<UriRef>(linkedEntity.getOccurrences().size());
+ //first create the TextAnnotations for the Occurrences
+ for(Occurrence occurrence : linkedEntity.getOccurrences()){
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ textAnnotations.add(textAnnotation);
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_START,
+ literalFactory.createTypedLiteral(occurrence.getStart())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_END,
+ literalFactory.createTypedLiteral(occurrence.getEnd())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_SELECTION_CONTEXT,
+ new PlainLiteralImpl(occurrence.getContext(),languageObject)));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_SELECTED_TEXT,
+ new PlainLiteralImpl(occurrence.getSelectedText(),languageObject)));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_CONFIDENCE,
+ literalFactory.createTypedLiteral(linkedEntity.getScore())));
+ for(UriRef dcType : linkedEntity.getTypes()){
+ metadata.add(new TripleImpl(
+ textAnnotation, Properties.DC_TYPE, dcType));
+ }
+ }
+ //now the EntityAnnotations for the Suggestions
+ for(Suggestion suggestion : linkedEntity.getSuggestions()){
+ UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
+ //should we use the label used for the match, or search the
+ //representation for the best label ... currently its the matched one
+ Text label = suggestion.getBestLabel(linkerConfig.getNameField(),language);
+ Representation rep = suggestion.getRepresentation();
+ UriRef uri = new UriRef(rep.getId());
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_LABEL,
+ label.getLanguage() == null ?
+ new PlainLiteralImpl(label.getText()) :
+ new PlainLiteralImpl(label.getText(),
+ new Language(label.getLanguage()))));
+ metadata.add(new TripleImpl(entityAnnotation,ENHANCER_ENTITY_REFERENCE,uri));
+ Iterator<Reference> suggestionTypes = rep.getReferences(linkerConfig.getTypeField());
+ while(suggestionTypes.hasNext()){
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_TYPE, new UriRef(suggestionTypes.next().getReference())));
+ }
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
+ for(UriRef textAnnotation : textAnnotations){
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.DC_RELATION, textAnnotation));
+ }
+ //add origin information of the EntiySearcher
+ for(Entry<UriRef,Collection<Resource>> originInfo : entitySearcher.getOriginInformation().entrySet()){
+ for(Resource value : originInfo.getValue()){
+ metadata.add(new TripleImpl(entityAnnotation,
+ originInfo.getKey(),value));
+ }
+ }
+ //in case dereferencing of Entities is enabled we need also to
+ //add the RDF data for entities
+ if(linkerConfig.isDereferenceEntitiesEnabled() &&
+ dereferencedEntitis.add(uri)){ //not yet dereferenced
+ metadata.addAll(
+ RdfValueFactory.getInstance().toRdfRepresentation(
+ suggestion.getRepresentation()).getRdfGraph());
+ }
+ }
+ }
+ }
+
+}