You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/09/22 08:51:36 UTC
svn commit: r1173968 [3/5] - in /incubator/stanbol/trunk:
commons/installer/bundleprovider/src/main/java/org/apache/stanbol/commons/installer/provider/bundle/impl/
commons/jsonld/ commons/opennlp/
commons/opennlp/src/main/java/org/apache/stanbol/common...
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,676 @@
+package org.apache.stanbol.enhancer.engines.keywordextraction.engine;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.PropertyOption;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
+import org.apache.felix.scr.annotations.ReferenceStrategy;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer;
+import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.AnalysedContent;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinker;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.LinkedEntity.Occurrence;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.EntityhubSearcher;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.ReferencedSiteSearcher;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.TrackingEntitySearcher;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Component(configurationFactory = true, policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
+ specVersion = "1.1", metatype = true, immediate = true)
+@Service
+@org.apache.felix.scr.annotations.Properties(value={
+ @Property(name=KeywordLinkingEngine.REFERENCED_SITE_ID),
+ @Property(name=KeywordLinkingEngine.NAME_FIELD,value=EntityLinkerConfig.DEFAULT_NAME_FIELD),
+ @Property(name=KeywordLinkingEngine.TYPE_FIELD,value=EntityLinkerConfig.DEFAULT_TYPE_FIELD),
+ @Property(name=KeywordLinkingEngine.REDIRECT_FIELD,value=EntityLinkerConfig.DEFAULT_REDIRECT_FIELD),
+ //@Property(name=TaxonomyLinkingEngine2.SIMPLE_TOKENIZER,boolValue=true),
+ //@Property(name=TaxonomyLinkingEngine2.ENABLE_CHUNKER,boolValue=false),
+ @Property(name=KeywordLinkingEngine.REDIRECT_PROCESSING_MODE,options={
+ @PropertyOption(
+ value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.ignore",
+ name="IGNORE"),
+ @PropertyOption(
+ value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.addValues",
+ name="ADD_VALUES"),
+ @PropertyOption(
+ value='%'+KeywordLinkingEngine.REDIRECT_PROCESSING_MODE+".option.follow",
+ name="FOLLOW")
+ },value="FOLLOW"),
+ @Property(name=KeywordLinkingEngine.MIN_SEARCH_TOKEN_LENGTH,
+ intValue=EntityLinkerConfig.DEFAULT_MIN_SEARCH_TOKEN_LENGTH),
+ @Property(name=KeywordLinkingEngine.MAX_SUGGESTIONS,
+ intValue=EntityLinkerConfig.DEFAULT_SUGGESTIONS),
+ @Property(name=KeywordLinkingEngine.PROCESSED_LANGUAGES,value="")
+})
+public class KeywordLinkingEngine implements EnhancementEngine, ServiceProperties{
+
+ private final Logger log = LoggerFactory.getLogger(KeywordLinkingEngine.class);
+ /**
+ * This is used to check the content type of parsed {@link ContentItem}s for
+ * plain text
+ */
+ protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /**
+ * The default value for the Execution of this Engine.
+ * This Engine creates TextAnnotations that should not be processed by other Engines.
+ * Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT}
+ * to ensure that other engines do not get confused
+ */
+ public static final Integer DEFAULT_ORDER = ServiceProperties.ORDERING_DEFAULT - 10;
+
+
+ public static final String REFERENCED_SITE_ID = "org.apache.stanbol.enhancer.engines.keywordextraction.referencedSiteId";
+ public static final String NAME_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.nameField";
+ public static final String TYPE_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.typeField";
+ public static final String REDIRECT_FIELD = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectField";
+ public static final String REDIRECT_PROCESSING_MODE = "org.apache.stanbol.enhancer.engines.keywordextraction.redirectMode";
+ public static final String MIN_SEARCH_TOKEN_LENGTH = "org.apache.stanbol.enhancer.engines.keywordextraction.minSearchTokenLength";
+ public static final String MAX_SUGGESTIONS = "org.apache.stanbol.enhancer.engines.keywordextraction.maxSuggestions";
+ public static final String PROCESSED_LANGUAGES = "org.apache.stanbol.enhancer.engines.keywordextraction.processedLanguages";
+ public static final String MIN_FOUND_TOKENS= "org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens";
+// public static final String SIMPLE_TOKENIZER = "org.apache.stanbol.enhancer.engines.keywordextraction.simpleTokenizer";
+// public static final String ENABLE_CHUNKER = "org.apache.stanbol.enhancer.engines.keywordextraction.enableChunker";
+ /**
+ * Default set of languages. This is an empty set indicating that texts in any
+ * language are processed.
+ */
+ public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet();
+ /**
+ * The languages this engine is configured to enhance. An empty List is
+ * considered as active for any language
+ */
+ private Set<String> languages = DEFAULT_LANGUAGES;
+ /**
+ * The literal representing the LangIDEngine as creator.
+ */
+ public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
+
+ private EntitySearcher entitySearcher;
+ private EntityLinkerConfig config;
+
+ /**
+ * The reference to the OpenNLP component
+ */
+ @org.apache.felix.scr.annotations.Reference
+ private OpenNLP openNLP;
+ /**
+ * Used for natural language processing of parsed content
+ */
+ private TextAnalyzer textAnalyser;
+ /**
+ * Used to create {@link AnalysedContent} instances for parsed content items
+ */
+ private OpenNlpAnalysedContentFactory analysedContentFactory;
+ /**
+ * The literalFactory used to create typed literals
+ */
+ private LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+ /**
+ * The {@link OfflineMode} is used by Stanbol to indicate that no external service should be referenced.
+ * For this engine that means it is necessary to check if the used {@link ReferencedSite} can operate
+ * offline or not.
+ *
+ * @see #enableOfflineMode(OfflineMode)
+ * @see #disableOfflineMode(OfflineMode)
+ */
+ @org.apache.felix.scr.annotations.Reference(
+ cardinality = ReferenceCardinality.OPTIONAL_UNARY,
+ policy = ReferencePolicy.DYNAMIC,
+ bind = "enableOfflineMode",
+ unbind = "disableOfflineMode",
+ strategy = ReferenceStrategy.EVENT)
+ private OfflineMode offlineMode;
+
+ /**
+ * Called by the ConfigurationAdmin to bind the {@link #offlineMode} if the service becomes available
+ *
+ * @param mode
+ */
+ protected final void enableOfflineMode(OfflineMode mode) {
+ this.offlineMode = mode;
+ }
+
+ /**
+ * Called by the ConfigurationAdmin to unbind the {@link #offlineMode} if the service becomes unavailable
+ *
+ * @param mode
+ */
+ protected final void disableOfflineMode(OfflineMode mode) {
+ this.offlineMode = null;
+ }
+
+ /**
+ * Returns <code>true</code> only if Stanbol operates in {@link OfflineMode}.
+ *
+ * @return the offline state
+ */
+ protected final boolean isOfflineMode() {
+ return offlineMode != null;
+ }
+
+ /**
+ * Default constructor as used by OSGI. This expects that
+ * {@link #activate(ComponentContext)} is called before usage
+ */
+ public KeywordLinkingEngine() {
+ }
+ /**
+ * Internal Constructor used by {@link #createInstance(OpenNLP, EntitySearcher, EntityLinkerConfig)}
+ * @param openNLP
+ * @param entitySearcher
+ * @param config
+ */
+ protected KeywordLinkingEngine(OpenNLP openNLP,EntitySearcher entitySearcher,
+ EntityLinkerConfig config){
+ this.openNLP = openNLP;
+ this.textAnalyser = new TextAnalyzer(openNLP);
+ this.analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(textAnalyser);
+ this.entitySearcher = entitySearcher;
+ this.config = config != null ? config : new EntityLinkerConfig();
+ }
+ /**
+ * Allows to create an instance that can be used outside of an OSGI
+ * environment. This is mainly intended for unit tests.
+ * @param openNLP The {@link OpenNLP} instance used for natural language processing
+ * @param entitySearcher the searcher used to lookup terms
+ * @param config the configuration or <code>null</code> to use the defaults
+ * @return the created engine instance
+ */
+ public static KeywordLinkingEngine createInstance(OpenNLP openNLP,
+ EntitySearcher entitySearcher,
+ EntityLinkerConfig config){
+ return new KeywordLinkingEngine(openNLP,entitySearcher,config);
+ }
+
+
+ /**
+ * Checks if the parsed language is enabled for processing.
+ * @param language The language to process
+ * @return the processing state for the parsed language.
+ */
+ protected boolean isProcessableLanguages(String language) {
+ return languages.isEmpty() || languages.contains(language);
+ }
+
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(
+ ENHANCEMENT_ENGINE_ORDERING,
+ (Object) DEFAULT_ORDER));
+ }
+
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ String mimeType = ci.getMimeType().split(";", 2)[0];
+ if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
+ return ENHANCE_SYNCHRONOUS;
+ }
+ // check for existence of textual content in metadata
+ UriRef subj = new UriRef(ci.getId());
+ Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
+ if (it.hasNext()) {
+ return ENHANCE_SYNCHRONOUS;
+ }
+ return CANNOT_ENHANCE;
+ }
+
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
+ throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
+ }
+ String mimeType = ci.getMimeType().split(";", 2)[0];
+ String text = extractText(ci, mimeType);
+ if (text.trim().length() == 0) {
+ // TODO: make the length of the data a field of the ContentItem
+ // interface to be able to filter out empty items in the canEnhance
+ // method
+ log.warn("nothing to extract knowledge from in ContentItem {}", ci);
+ return;
+ }
+ //Determine the language
+ String language = extractLanguage(ci);
+ if(isProcessableLanguages(language)){
+ log.debug("computeEnhancements for ContentItem {} language {} text={}",
+ new Object []{ci.getId(), language, StringUtils.abbreviate(text, 100)});
+
+ EntityLinker taxonomyLinker = new EntityLinker(
+ analysedContentFactory.create(text, language),
+ entitySearcher, config);
+ //process
+ taxonomyLinker.process();
+ //write results
+ writeEnhancements(ci, taxonomyLinker.getLinkedEntities().values(), language);
+ } else {
+ log.debug("ignore ContentItem {} because language '{}' is not configured to" +
+ "be processed by this engine.",ci.getId(),language);
+ }
+
+ }
+
+ /**
+ * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
+ * extracted from the parsed ContentItem
+ * @param ci
+ * @param linkedEntities
+ * @param language
+ */
+ private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
+ MGraph metadata = ci.getMetadata();
+ for(LinkedEntity linkedEntity : linkedEntities){
+ Collection<UriRef> textAnnotations = new ArrayList<UriRef>(linkedEntity.getOccurrences().size());
+ //first create the TextAnnotations for the Occurrences
+ for(Occurrence occurrence : linkedEntity.getOccurrences()){
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ textAnnotations.add(textAnnotation);
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_START,
+ literalFactory.createTypedLiteral(occurrence.getStart())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_END,
+ literalFactory.createTypedLiteral(occurrence.getEnd())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_SELECTION_CONTEXT,
+ literalFactory.createTypedLiteral(occurrence.getContext())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_SELECTED_TEXT,
+ literalFactory.createTypedLiteral(occurrence.getSelectedText())));
+ metadata.add(new TripleImpl(textAnnotation,
+ Properties.ENHANCER_CONFIDENCE,
+ literalFactory.createTypedLiteral(linkedEntity.getScore())));
+ for(UriRef dcType : linkedEntity.getTypes()){
+ metadata.add(new TripleImpl(
+ textAnnotation, Properties.DC_TYPE, dcType));
+ }
+ }
+ //now the EntityAnnotations for the Suggestions
+ for(Suggestion suggestion : linkedEntity.getSuggestions()){
+ UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
+ //should we use the label used for the match, or search the
+ //representation for the best label ... currently its the matched one
+ Text label = suggestion.getBestLabel(config.getNameField(),language);
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_LABEL,
+ label.getLanguage() == null ?
+ new PlainLiteralImpl(label.getText()) :
+ new PlainLiteralImpl(label.getText(),
+ new Language(label.getLanguage()))));
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_REFERENCE,
+ new UriRef(suggestion.getRepresentation().getId())));
+ Iterator<Reference> suggestionTypes = suggestion.getRepresentation().getReferences(config.getTypeField());
+ while(suggestionTypes.hasNext()){
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_ENTITY_TYPE, new UriRef(suggestionTypes.next().getReference())));
+ }
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
+ for(UriRef textAnnotation : textAnnotations){
+ metadata.add(new TripleImpl(entityAnnotation,
+ Properties.DC_RELATION, textAnnotation));
+ }
+ }
+ }
+ }
+ /**
+ * Extracts the language of the parsed ContentItem from the metadata
+ * @param ci the content item
+ * @return the language
+ */
+ private String extractLanguage(ContentItem ci) {
+ MGraph metadata = ci.getMetadata();
+ Iterator<Triple> langaugeEnhancementCreatorTriples =
+ metadata.filter(null, Properties.DC_CREATOR, LANG_ID_ENGINE_NAME);
+ if(langaugeEnhancementCreatorTriples.hasNext()){
+ String lang = EnhancementEngineHelper.getString(metadata,
+ langaugeEnhancementCreatorTriples.next().getSubject(),
+ Properties.DC_LANGUAGE);
+ if(lang != null){
+ return lang;
+ } else {
+ log.warn("Unable to extract language for ContentItem %s! The Enhancement of the %s is missing the %s property",
+ new Object[]{ci.getId(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE});
+ log.warn(" ... return 'en' as default");
+ return "en";
+ }
+ } else {
+ log.warn("Unable to extract language for ContentItem %s! Is the %s active?",
+ ci.getId(),LANG_ID_ENGINE_NAME.getLexicalForm());
+ log.warn(" ... return 'en' as default");
+ return "en";
+ }
+ }
+
+ /**
+ * Extracts the text from the parsed contentItem. In case the content type is
+ * plain text, it directly reads the text from the stream. In other cases it
+ * tries to read the string representation from the metadata by looking for
+ * values of the {@link org.apache.stanbol.enhancer.servicesapi.rdf.Properties#NIE_PLAINTEXTCONTENT}
+ * property.<p>
+ * TODO: This is a Workaround for the currently not implemented Adapter
+ * Pattern for the Stanbol Enhancer.
+ * @param ci
+ * @param mimeType
+ * @return
+ * @throws InvalidContentException
+ */
+ private String extractText(ContentItem ci, String mimeType) throws InvalidContentException {
+ String text;
+ if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) {
+ try {
+ text = IOUtils.toString(ci.getStream(),"UTF-8");
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
+ }
+ } else {
+ //TODO: change that as soon the Adapter Pattern is used for multiple
+ // mimetype support.
+ StringBuilder textBuilder = new StringBuilder();
+ Iterator<Triple> it = ci.getMetadata().filter(new UriRef(ci.getId()), NIE_PLAINTEXTCONTENT, null);
+ while (it.hasNext()) {
+ textBuilder.append(it.next().getObject());
+ }
+ text = textBuilder.toString();
+ }
+ return text;
+ }
+
+ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ * Methods for activate() and deactivate() the properties configureable via
+ * OSGI.
+ *
+ * NOTEs:
+ * Directly calling super.activate and super.deactivate
+ * is possible but might not be applicable in all cases.
+ * The activate**(...) and deactivate**() Methods are intended to be
+ * called by subclasses that need more control over the initialisation
+ * process.
+ * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ */
+ /**
+ * Activates this Engine. Subclasses should not call this method but rather
+ * call<ul>
+ * <li> {@link #activateEntitySearcher(ComponentContext, Dictionary)}
+ * <li> {@link #initEntityLinkerConfig(Dictionary, EntityLinkerConfig)} and
+ * <li> {@link #activateProcessedLanguages(Dictionary)}
+ * </ul>
+ * if applicable.
+ * @param context the Component context
+ * @throws ConfigurationException if the required {@link #REFERENCED_SITE_ID}
+ * configuration is missing or any of the other properties has an illegal value
+ */
+ @Activate
+ @SuppressWarnings("unchecked")
+ protected void activate(ComponentContext context) throws ConfigurationException {
+ textAnalyser = new TextAnalyzer(openNLP);
+ analysedContentFactory = OpenNlpAnalysedContentFactory.getInstance(textAnalyser);
+ Dictionary<String,Object> properties = context.getProperties();
+ activateEntitySearcher(context, properties);
+ activateEntityLinkerConfig(properties);
+ activateProcessedLanguages(properties);
+ }
+
+ /**
+ * Initialise the processed languages based on the value of the
+ * {@link #PROCESSED_LANGUAGES} key. If no configuration is present the
+ * default (process all languages) is used.
+ * @param configuration the OSGI component configuration
+ */
+ protected final void activateProcessedLanguages(Dictionary<String,Object> configuration) {
+ Object value;
+ value = configuration.get(PROCESSED_LANGUAGES);
+ if(value == null){
+ this.languages = DEFAULT_LANGUAGES;
+ } else if (value.toString().trim().isEmpty()){
+ this.languages = Collections.emptySet();
+ } else {
+ String[] languageArray = value.toString().split(",");
+ languages = new HashSet<String>();
+ for(String language : languageArray){
+ if(language != null){
+ language = language.trim();
+ if(!language.isEmpty()){
+ languages.add(language);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Configures the parsed {@link EntityLinkerConfig} with the values of the
+ * following properties:<ul>
+ * <li>{@link #NAME_FIELD}
+ * <li>{@link #TYPE_FIELD}
+ * <li>{@link #REDIRECT_FIELD}
+ * <li>{@link #REDIRECT_PROCESSING_MODE}
+ * <li>{@link #MAX_SUGGESTIONS}
+ * <li>{@link #MIN_SEARCH_TOKEN_LENGTH}
+ * <li>{@link #MIN_FOUND_TOKENS}
+ * </ul>
+ * This Method create an new {@link EntityLinkerConfig} instance only if
+ * <code>{@link #config} == null</code>. If the instance is already initialised
+ * that all current values for keys missing in the parsed configuration are
+ * preserved.
+ * @param configuration the configuration
+ * @throws ConfigurationException In case of an illegal value in the parsed configuration.
+ * Note that all configuration are assumed as optional, therefore missing values will not
+ * case a ConfigurationException.
+ */
+ protected void activateEntityLinkerConfig(Dictionary<String,Object> configuration) throws ConfigurationException {
+ if(config == null){
+ this.config = new EntityLinkerConfig();
+ }
+ Object value;
+ value = configuration.get(NAME_FIELD);
+ if(value != null){
+ if(value.toString().isEmpty()){
+ throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty");
+ }
+ config.setNameField(value.toString());
+ }
+ //init TYPE_FIELD
+ value = configuration.get(TYPE_FIELD);
+ if(value != null){
+ if(value.toString().isEmpty()){
+ throw new ConfigurationException(TYPE_FIELD,"The configured name field MUST NOT be empty");
+ }
+ config.setTypeField(value.toString());
+ }
+ //init REDIRECT_FIELD
+ value = configuration.get(REDIRECT_FIELD);
+ if(value != null){
+ if(value.toString().isEmpty()){
+ throw new ConfigurationException(NAME_FIELD,"The configured name field MUST NOT be empty");
+ }
+ config.setRedirectField(value.toString());
+ }
+ //init MAX_SUGGESTIONS
+ value = configuration.get(MAX_SUGGESTIONS);
+ Integer maxSuggestions;
+ if(value instanceof Integer){
+ maxSuggestions = (Integer)value;
+ } else if (value != null){
+ try {
+ maxSuggestions = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ maxSuggestions = null;
+ }
+ if(maxSuggestions != null){
+ if(maxSuggestions < 1){
+ throw new ConfigurationException(MAX_SUGGESTIONS, "Values MUST be valid Integer values > 0");
+ }
+ config.setMaxSuggestions(maxSuggestions);
+ }
+ //init MIN_FOUND_TOKENS
+ value = configuration.get(MIN_FOUND_TOKENS);
+ Integer minFoundTokens;
+ if(value instanceof Integer){
+ minFoundTokens = (Integer)value;
+ } else if(value != null){
+ try {
+ minFoundTokens = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ minFoundTokens = null;
+ }
+ if(minFoundTokens != null){
+ if(minFoundTokens < 1){
+ throw new ConfigurationException(MIN_FOUND_TOKENS, "Values MUST be valid Integer values > 0");
+ }
+ config.setMinFoundTokens(minFoundTokens);
+ }
+ // init MIN_SEARCH_TOKEN_LENGTH
+ value = configuration.get(MIN_SEARCH_TOKEN_LENGTH);
+ Integer minSearchTokenLength;
+ if(value instanceof Integer){
+ minSearchTokenLength = (Integer)value;
+ } else if (value != null){
+ try {
+ minSearchTokenLength = Integer.valueOf(value.toString());
+ } catch(NumberFormatException e){
+ throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0",e);
+ }
+ } else {
+ minSearchTokenLength = null;
+ }
+ if(minSearchTokenLength != null){
+ if(minSearchTokenLength < 1){
+ throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0");
+ }
+ config.setMaxSuggestions(minSearchTokenLength);
+ }
+ //init the REDIRECT_PROCESSING_MODE
+ value = configuration.get(REDIRECT_PROCESSING_MODE);
+ if(value != null){
+ try {
+ config.setRedirectProcessingMode(RedirectProcessingMode.valueOf(value.toString()));
+ } catch (IllegalArgumentException e) {
+ throw new ConfigurationException(REDIRECT_PROCESSING_MODE, "Values MUST be one of "+
+ Arrays.toString(RedirectProcessingMode.values()));
+ }
+ }
+ }
+
+ /**
+ * Initialise the {@link #entitySearcher} based on the value of the
+ * {@link #REFERENCED_SITE_ID} property in the parsed configuration
+ * @param context
+ * @param configuration
+ * @throws ConfigurationException
+ */
+ protected void activateEntitySearcher(ComponentContext context, Dictionary<String,Object> configuration) throws ConfigurationException {
+ Object value = configuration.get(REFERENCED_SITE_ID);
+ //init the EntitySource
+ if (value == null) {
+ throw new ConfigurationException(REFERENCED_SITE_ID,
+ "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!");
+ }
+ String refSiteId = value.toString();
+ if (refSiteId.isEmpty()) {
+ throw new ConfigurationException(REFERENCED_SITE_ID,
+ "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
+ }
+ if(Entityhub.ENTITYHUB_IDS.contains(refSiteId.toLowerCase())){
+ entitySearcher = new EntityhubSearcher(context.getBundleContext());
+ } else {
+ entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),refSiteId);
+ }
+ }
+ /**
+ * Deactivates this Engine. Subclasses should not call this method but rather
+ * call<ul>
+ * <li> {@link #deactivateEntitySearcher()}
+ * <li> {@link #deactivateEntityLinkerConfig()} and
+ * <li> {@link #deactivateProcessedLanguages())}
+ * </ul>
+ * @param context the context (not used)
+ */
+ @Deactivate
+ protected void deactivate(ComponentContext context) {
+ deactivateEntitySearcher();
+ deactivateProcessedLanguages();
+ deactivateEntityLinkerConfig();
+ }
+
+ /**
+ * Sets the languages to {@link #DEFAULT_LANGUAGES}
+ */
+ protected void deactivateProcessedLanguages() {
+ languages = DEFAULT_LANGUAGES;
+ }
+
+ /**
+ * sets the {@link EntityLinkerConfig} to <code>null</code>
+ */
+ protected void deactivateEntityLinkerConfig() {
+ config = null;
+ }
+
+ /**
+ * Closes and resets the EntitySearcher. Also calls
+ * {@link TrackingEntitySearcher#close()} if applicable.
+ */
+ protected void deactivateEntitySearcher() {
+ if(entitySearcher instanceof TrackingEntitySearcher<?>){
+ //close tracking EntitySearcher
+ ((TrackingEntitySearcher<?>)entitySearcher).close();
+ }
+ entitySearcher = null;
+ }
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,224 @@
+/**
+ *
+ */
+package org.apache.stanbol.enhancer.engines.keywordextraction.impl;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Chunk;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
+
+public class ProcessingState {
+
+ private final Iterator<AnalysedText> sentences;
+ /**
+ * The sentence currently processed
+ */
+ private AnalysedText sentence;
+ /**
+ * The index of the current token needed to be linked
+ */
+ private int tokenIndex = -1;
+ /**
+ * The current token
+ */
+ private Token token;
+ /**
+ * The iterator over the chunks of the current {@link #sentence}
+ * or <code>null</code> if no {@link Chunk}s are available.
+ */
+ private Iterator<Chunk> chunks;
+ /**
+ * The current {@link Chunk}
+ */
+ private Chunk chunk;
+ /**
+ * This is a cache over the exact labels over the following 'n' tokens
+ * relative {@link #tokenIndex}. It is cleared each time {@link #next()}
+ * is called.
+ */
+ private Map<Integer,String> textCache = new HashMap<Integer,String>();
+ /**
+ * The position for the next token
+ */
+ private int nextToken = -1;
+
+ public ProcessingState(Iterator<AnalysedText> sentences){
+ this.sentences = sentences;
+ if(!sentences.hasNext()){
+ throw new IllegalArgumentException("The parsed AnalysedContent MUST NOT have an empty AnalysedText iterator!");
+ }
+ }
+ /**
+ * Getter for the current Sentence
+ * @return the sentence
+ */
+ public final AnalysedText getSentence() {
+ return sentence;
+ }
+ /**
+ * Getter for the index of the current active token within the current
+ * active {@link #getSentence() sentence}
+ * @return the tokenPos the index of the token
+ */
+ public final int getTokenIndex() {
+ return tokenIndex;
+ }
+ /**
+ * The currently active token
+ * @return the token
+ */
+ public final Token getToken() {
+ return token;
+ }
+ /**
+ * The currently active chunk or <code>null</code> if no chunks are
+ * available. If chunks are present this can not be <code>null</code>
+ * because {@link Token}s outside of chunks are skiped.
+ * @return the chunk the current {@link Chunk} or <code>null</code> if
+ * no chunks are present.
+ */
+ public final Chunk getChunk() {
+ return chunk;
+ }
+ /**
+ * Getter for the next {@link Token} to be processed. Calling {@link #next()}
+ * is guaranteed to skip all tokens in between {@link #getTokenIndex()}
+ * and {@link #getNextToken()}, but it might even skip more tokens (e.g.
+ * in case that the token referenced by {@link #getNextToken()} is not
+ * within a {@link Chunk}
+ * @return the nextToken
+ */
+ public final int getNextToken() {
+ return nextToken;
+ }
+ /**
+ * Allows to manually set to position of the next token to process.
+ * This can be used to skip some tokens within (e.g. if a Concept
+ * matching multiple Tokens where found.<p>
+ * The set token may be greater than the number of tokens in
+ * {@link #sentence}. This will simple cause the next sentence to be
+ * activated on the next call to {@link #next()}
+ * @param pos the position of the next token to process.
+ */
+ public void setNextToken(int pos){
+ if(pos > tokenIndex){
+ this.nextToken = pos;
+ } else {
+ throw new IllegalArgumentException("The nextTokenPos "+pos+
+ " MUST BE greater than the current "+tokenIndex);
+ }
+ }
+ /**
+ * Moves the state to #nextToken this may switch to the next Chunk or
+ * sentence.
+ * @return <code>true</code> if there are further elements to process or
+ * <code>false</code> if there are no further elements to process.
+ */
+ public boolean next() {
+ //first clear caches for the current element
+ textCache.clear();
+ //switch to the next token
+ if(nextToken > tokenIndex){
+ tokenIndex = nextToken;
+ } else {
+ tokenIndex++;
+ nextToken = tokenIndex;
+ }
+ //now init the next element
+ final boolean hasNext;
+ if(chunk != null){ //if chunks are present
+ //get next chunk (may be the current if chunk.getEnd() > tokenPos
+ for(;tokenIndex > chunk.getEnd() && chunks.hasNext();chunk = chunks.next());
+ if(tokenIndex <= chunk.getEnd()){ //found valid chunk
+ if(chunk.getStart() > tokenIndex) { //skip tokens outside chunks
+ tokenIndex = chunk.getStart();
+ }
+ hasNext = true;
+ } else { //no more valid chunks in this sentence
+ hasNext = initNextSentence();
+ }
+ } else { //no chunks ... use tokens only
+ if(sentence == null){ //first sentence
+ hasNext = initNextSentence();
+ } else if(tokenIndex >= sentence.getTokens().size()){
+ hasNext = initNextSentence();
+ } else { //more tokens in the sentence
+ //set the token
+ hasNext = true;
+ }
+ }
+ if(hasNext){ //set the Token
+ token = sentence.getTokens().get(tokenIndex);
+ }
+ return hasNext;
+ }
+
+ /**
+ * Correctly initialise {@link #sentence}, {@link #chunks}, {@link #chunk}
+ * and {@link #tokenIndex} for the next element of {@link #sentences}. If
+ * no further sentences are to process it simple sets {@link #sentence},
+ * {@link #chunks}, {@link #chunk} and {@link #tokenIndex} to <code>null</code>
+ */
+ private boolean initNextSentence() {
+ sentence = null;
+ while(sentence == null && sentences.hasNext()){
+ sentence = sentences.next();
+ if(sentence.getChunks() != null){
+ chunks = sentence.getChunks().iterator();
+ if(chunks.hasNext()){
+ chunk = chunks.next();
+ tokenIndex = chunk.getStart();
+ nextToken = tokenIndex;
+ } else { //no chunks in this sentence
+ sentence = null; //skip this sentence
+ }
+ } else {
+ if(sentence.getTokens().isEmpty()){ //no tokens in this sentence
+ sentence = null; //skip this one
+ } else {
+ chunks = null;
+ chunk = null;
+ tokenIndex = 0;
+ nextToken = 0;
+ }
+ }
+ }
+ return sentence != null;
+ }
+ /**
+ * Getter for the text covered by the next tokenCount tokens relative to
+ * {@link #token}. It uses the {@link #textCache} to lookup/store such texts.
+ * Given the Tokens
+ * <pre>
+ * [This, is, an, Example]
+ * </pre>
+ * and the parameter <code>3</code> this method will return
+ * <pre>
+ * This is an
+ * </pre>
+ * @param tokenCount the number of tokens to be included relative to
+ * {@link #tokenIndex}
+ * @return the text covered by the span start of {@link #token} to end of
+ * token at <code>{@link #tokenIndex}+tokenCount</code>.
+ */
+ public String getTokenText(int tokenCount){
+ Integer pos = Integer.valueOf(tokenCount-1);
+ String text = textCache.get(Integer.valueOf(tokenCount-1));
+ if(text == null){
+ text = sentence.getText().substring(token.getStart(),
+ sentence.getTokens().get(tokenIndex+pos.intValue()).getEnd());
+ textCache.put(pos, text);
+ }
+ return text;
+ }
+ @Override
+ public String toString() {
+ return "["+tokenIndex+","+token+"] chunk: " +
+ (chunk == null?null:chunk.getText())+"| sentence: "+
+ (sentence == null?null:sentence.getText());
+ }
+}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/ProcessingState.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,63 @@
+/**
+ *
+ */
+package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+
+import java.util.Iterator;
+
+import opennlp.tools.util.Span;
+
+import org.apache.stanbol.commons.opennlp.TextAnalyzer;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
+
+/**
+ * Represents content that was already analysed with NLP tools and that is to
+ * be linked with Entities of an {@link EntitySearcher}.<p>
+ * Note that for the linking process it is only required that the text is
+ * tokenized. All other features (sentence detection, POS tags and Chunks) are
+ * optional, but they do improve the performance and, to a smaller extent,
+ * also the results of the linking process. <p>
+ * TODO: <ul>
+ * <li> Find a better Name
+ * <li> The API is not optimal. In general the {@link TextAnalyzer} and the
+ * {@link AnalysedContent} interface do not play well together :(
+ * </ul>
+ * @author Rupert Westenthaler
+ *
+ */
+public interface AnalysedContent {
+
+
+ /**
+ * Getter for the Iterator over the analysed sentences. This Method
+ * is expected to always return the same Iterator instance, because the
+ * iteration state is shared with the linking process.
+ * @return the iterator over the analysed sentences
+ */
+ public Iterator<AnalysedText> getAnalysedText();
+ /**
+ * Called to check if a {@link Token} should be used to search for
+ * Concepts within the Taxonomy based on the POS tag of the Token.
+ * @param posTag the POS tag to check
+ * @return <code>true</code> if Tokens with this POS tag should be
+ * included in searches and <code>false</code> if not. If this information
+ * is not available (e.g. no set of POS tags that need to be processed is
+ * defined) this Method MUST return <code>null</code>
+ */
+ public Boolean processPOS(String posTag);
+ /**
+ * Called to check if a chunk should be used to search for Concepts.
+ * @param chunkTag the tag (type) of the chunk
+ * @return <code>true</code> if chunks with this tag (type) should be
+ * processed (used to search for matches of concepts) and <code>false</code>
+ * if not. If this information is not available (e.g. no set of Tags that
+ * need to be processed is defined) this Method MUST return <code>null</code>
+ */
+ public Boolean processChunk(String chunkTag);
+ /**
+ * Tokenizes the parsed label
+ * @param label the label to tokenize
+ * @return the texts of the tokens (NOTE: the declared return type is
+ * <code>String[]</code> - the token texts, not their {@link Span}s)
+ */
+ public String[] tokenize(String label);
+}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/AnalysedContent.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,375 @@
+package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.util.Span;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
+import org.apache.stanbol.enhancer.engines.keywordextraction.impl.ProcessingState;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntityLinkerConfig.RedirectProcessingMode;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion.MATCH;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
+
+public class EntityLinker {
+
+ private final EntityLinkerConfig config;
+ private final AnalysedContent content;
+ private final EntitySearcher entitySearcher;
+ /**
+ * The state of the current processing
+ */
+ private final ProcessingState state;
+ /**
+ * The map holding the results of the linking process
+ */
+ private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
+
+ /**
+ * After {@link #process()}ing this returns the entities linked for the
+ * parsed {@link AnalysedContent}.
+ * @return the linked entities
+ */
+ public final Map<String,LinkedEntity> getLinkedEntities() {
+ return linkedEntities;
+ }
+ public EntityLinker(AnalysedContent content,EntitySearcher taxonomy,EntityLinkerConfig config){
+ if(config == null){
+ throw new IllegalArgumentException("The parsed TaxonomyLinkerConfig MUST NOT be NULL!");
+ }
+ if(taxonomy == null){
+ throw new IllegalArgumentException("The parsed Taxonomy MUST NOT be NULL!");
+ }
+ if(content == null){
+ throw new IllegalArgumentException("The parsed AnalysedContent MUST NOT be NULL!");
+ }
+ this.content = content;
+ this.entitySearcher = taxonomy;
+ this.config = config;
+ this.state = new ProcessingState(content.getAnalysedText());
+ }
+ /**
+ * Steps over the sentences, chunks, tokens of the {@link #sentences}
+ */
+ public void process(){
+ while(state.next()) {
+ if(isProcessableToken(state.getToken())){
+ List<String> searchStrings = new ArrayList<String>(config.getMaxSearchTokens());
+ searchStrings.add(state.getToken().getText());
+ //get the list of all tokens that can possible be matched
+ int includeTokenIndex = state.getTokenIndex();
+ includeTokenIndex++;
+ while(searchStrings.size() < config.getMaxSearchTokens() && //more search strings
+ (includeTokenIndex <= (state.getChunk() != null ? //still within
+ state.getChunk().getEnd() : //the chunk
+ state.getSentence().getTokens().size()-1))){ //or sentence
+ Token included = state.getSentence().getTokens().get(includeTokenIndex);
+ includeTokenIndex++;
+ if(isProcessableToken(included)){
+ searchStrings.add(included.getText());
+ }
+ }
+ //search for Entities
+ List<Suggestion> suggestions = lookupEntities(searchStrings);
+ if(!suggestions.isEmpty()){
+ //update the suggestions based on the best match
+ int bestMatchCount = suggestions.get(0).getMatchCount();
+ Iterator<Suggestion> it = suggestions.iterator();
+ while(it.hasNext()){
+ Suggestion suggestion = it.next();
+ //suggestions that match less tokens as the best match
+ //need to be updated to PARTIAL
+ if(suggestion.getMatchCount() < bestMatchCount){
+ suggestion.setMatch(MATCH.PARTIAL);
+ }
+ //Filter matches with less than config.getMinFoundTokens()
+ //if matchcount is less than of the best match
+ if(suggestion.getMatchCount() < bestMatchCount &&
+ suggestion.getMatchCount() < config.getMinFoundTokens()){
+ it.remove();
+ } else { //calculate the score
+ //how good is the current match in relation to the best one
+ double spanScore = ((double)suggestion.getMatchCount())/bestMatchCount;
+ //how good is the match to the span selected by this suggestion
+ double textScore = ((double)suggestion.getMatchCount())/suggestion.getSpan();
+ //how good is the match in relation to the tokens of the suggested label
+ double labelScore = ((double)suggestion.getMatchCount()/suggestion.getLabelTokenCount());
+ suggestion.setScore(spanScore*spanScore*textScore*labelScore);
+ }
+ }
+ Suggestion oldBestRanked = suggestions.get(0); //for debugging
+ //resort by score
+ Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+ //this should never happen ... but the
+ //matchcount of the best match MUST NOT change
+ //after the sort by score!
+ if(bestMatchCount != suggestions.get(0).getMatchCount()){
+ //TODO: change this to a warning (like to have exceptions during debugging)
+ throw new IllegalStateException(String.format(
+ "The match count for the top Ranked Suggestion for %s changed after resorting based on Scores! (original: %s, currnet %s)",
+ state.getTokenText(bestMatchCount),oldBestRanked,suggestions));
+ }
+ //remove all suggestions > config.maxSuggestions
+ if(suggestions.size() > config.getMaxSuggestions()){
+ suggestions.subList(config.getMaxSuggestions(),suggestions.size()).clear();
+ }
+
+ //process redirects
+ if(config.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){
+ for(Suggestion suggestion : suggestions){
+ processRedirects(suggestion);
+ }
+ }
+ int span = suggestions.get(0).getSpan();
+ //Store the linking results
+ String selectedText = state.getTokenText(span);
+ //float score;
+ LinkedEntity linkedEntity = linkedEntities.get(selectedText);
+ if(linkedEntity == null){
+ linkedEntity = new LinkedEntity(selectedText,
+ suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
+ linkedEntities.put(selectedText, linkedEntity);
+ }
+ linkedEntity.addOccurrence(
+ state.getSentence(), state.getTokenIndex(), span);
+ //set the next token to process to the next word after the
+ //currently found suggestion
+ state.setNextToken(state.getTokenIndex()+span);
+ }
+
+ } //else do not process this token
+ }
+ }
+ /**
+ * Retrieves all {@link EntitySearcher#getTypeField()} values of the parsed
+ * {@link Suggestion}s and than lookup the {@link NamespaceEnum#dcTerms dc}:type
+ * values for the {@link LinkedEntity#getTypes()} by using the configured
+ * {@link EntityLinkerConfig#getTypeMappings() types mappings} (and if
+ * no mapping is found the {@link EntityLinkerConfig#getDefaultDcType()
+ * default} type.
+ * @param conceptTypes The list of suggestions
+ * @return the types values for the {@link LinkedEntity}
+ */
+ private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){
+ Collection<String> conceptTypes = new HashSet<String>();
+ for(Suggestion suggestion : suggestions){
+ for(Iterator<Reference> types =
+ suggestion.getRepresentation().getReferences(config.getTypeField());
+ types.hasNext();conceptTypes.add(types.next().getReference()));
+ }
+ Map<String,UriRef> typeMappings = config.getTypeMappings();
+ Set<UriRef> dcTypes = new HashSet<UriRef>();
+ for(String conceptType : conceptTypes){
+ UriRef dcType = typeMappings.get(conceptType);
+ if(dcType != null){
+ dcTypes.add(dcType);
+ }
+ }
+ if(dcTypes.isEmpty() && config.getDefaultDcType() != null){
+ dcTypes.add(config.getDefaultDcType());
+ }
+ return dcTypes;
+ }
+ /**
+ * Processes {@link EntitySearcher#getRedirectField() redirect field} values for
+ * the parsed suggestions based on the {@link RedirectProcessingMode}
+ * as configured in the {@link #config}.<p>
+ * The results of this method are stored within the parsed {@link Suggestion}s
+ * @param suggestion The suggestion to process.
+ */
+ private void processRedirects(Suggestion suggestion) {
+ //if mode is IGNORE -> nothing to do
+ if(config.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){
+ return;
+ }
+ //in case results for queries are locally cached it might be the case
+ //that some/all of the results do already redirects processed.
+ //therefore there is a small internal state that stores this information
+ if(suggestion.isRedirectedProcessed()){
+ return; //Redirects for ResultMatch are already processed ... ignore
+ }
+ Representation result = suggestion.getResult();
+ Iterator<Reference> redirects = result.getReferences(config.getRedirectField());
+ switch (config.getRedirectProcessingMode()) {
+ case ADD_VALUES:
+ while(redirects.hasNext()){
+ Reference redirect = redirects.next();
+ if(redirect != null){
+ Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
+ config.getSelectedFields());
+ if(redirectedEntity != null){
+ for(Iterator<String> fields = redirectedEntity.getFieldNames();fields.hasNext();){
+ String field = fields.next();
+ result.add(field, redirectedEntity.get(field));
+ }
+ }
+ //set that the redirects where searched for this result
+ suggestion.setRedirectProcessed(true);
+ }
+ }
+ case FOLLOW:
+ while(redirects.hasNext()){
+ Reference redirect = redirects.next();
+ if(redirect != null){
+ Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
+ config.getSelectedFields());
+ if(redirectedEntity != null){
+ //copy the original result score
+ redirectedEntity.set(RdfResourceEnum.resultScore.getUri(),
+ result.get(RdfResourceEnum.resultScore.getUri()));
+ //set the redirect
+ suggestion.setRedirect(redirectedEntity);
+ }
+ }
+ }
+ default: //nothing to do
+ }
+ }
+ /**
+ * Searches for Entities in the {@link #entitySearcher} corresponding to the
+ * {@link Token#getText() words} of the current {@link #state position} in
+ * the text.
+ * @param searchStrings the list of {@link Token#getText() words} to search
+ * entities for.
+ * @return The sorted list with the suggestions.
+ * If there are no suggestions an empty list will be returned.
+ */
+ private List<Suggestion> lookupEntities(List<String> searchStrings) {
+ Collection<? extends Representation> results = entitySearcher.lookup(
+ config.getNameField(),config.getSelectedFields(),
+ searchStrings, state.getSentence().getLanguage());
+ List<Suggestion> suggestions = new ArrayList<Suggestion>();
+ for(Representation result : results){
+ Suggestion match = matchLabels(result);
+ if(match.getMatch() != MATCH.NONE){
+ suggestions.add(match);
+ }
+ }
+ //sort the suggestions
+ if(suggestions.size()>1){
+ Collections.sort(suggestions,Suggestion.DEFAULT_SUGGESTION_COMPARATOR);
+ }
+ //remove all elements > config.getMaxSuggestions()
+ return suggestions;
+ }
+ /**
+ * Matches the labels of the parsed {@link Representation} with the Tokens of
+ * the texts (beginning with the currently active
+ * {@link ProcessingState#getToken() token}).<p>
+ * The field used to get the labels is retrieved from
+ * {@link EntitySearcher#getNameField()}. Only labels with no language or the
+ * language of the current sentence are considered. If less than
+ * {@link EntityLinkerConfig#getMinFoundTokens()} tokens match with an
+ * label the Concept is only considered to match if the label is
+ * {@link String#equalsIgnoreCase(String)} to the text covered by the
+ * matched token(s). Otherwise also {@link MATCH#FULL} and {@link MATCH#PARTIAL}
+ * results are allowed.
+ * @param rep The representation including at least the data for the
+ * {@link EntitySearcher#getNameField()} property.
+ * @return The result of the matching.
+ */
+ private Suggestion matchLabels(Representation rep) {
+ Iterator<Text> labels = rep.getText(config.getNameField());
+ Suggestion match = new Suggestion(rep);
+ while(labels.hasNext()){
+ Text label = labels.next();
+ //NOTE: I use here startWith language because I want 'en-GB' labels accepted for 'en'
+ if(label.getLanguage() == null || label.getLanguage().startsWith(
+ state.getSentence().getLanguage())){
+ String text = label.getText().toLowerCase();
+ List<String> labelTokens = Arrays.asList(content.tokenize(text));
+ int foundTokens = 0;
+ //ensure the correct order of the tokens in the suggested entity
+ int foundInLabelIndex = 0;
+ boolean search = true;
+ int lastFoundIndex = -1;
+ Token currentToken;
+ int maxNotFound = 1; //TODO make configureable
+ int notFound = 0;
+ for(int currentIndex = state.getTokenIndex();currentIndex < state.getSentence().getTokens().size() && search;currentIndex++){
+ currentToken = state.getSentence().getTokens().get(currentIndex);
+ boolean isProcessable = isProcessableToken(currentToken);
+ int found = text.indexOf(currentToken.getText().toLowerCase());
+ if(found>=foundInLabelIndex){ //found
+ if(isProcessable){
+ foundTokens++; //only count processable Tokens
+ }
+ foundInLabelIndex = found+currentToken.getText().length();
+ lastFoundIndex = currentIndex;
+ } else { //not found
+ notFound++;
+ if(isProcessable || notFound > maxNotFound){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of tokens
+ //that are not processable are not found
+ search = false;
+ }
+ } //else it is OK if non processable tokens are not found
+ }
+ MATCH labelMatch;
+ int coveredTokens = lastFoundIndex-state.getTokenIndex()+1;
+ //Matching rules
+ // - if less than config#minTokenFound() than accept only EXACT
+ // - override PARTIAL matches with FULL/EXACT matches only if
+ // foundTokens of the PARTIAL match is > than of the FULL/EXACT
+ // match (this will be very rare
+ if(foundTokens > 0 && match.getMatchCount() <= foundTokens) {
+ String currentText = state.getTokenText(coveredTokens);
+ if(currentText.equalsIgnoreCase(label.getText())){
+ labelMatch = MATCH.EXACT;
+ //set found to covered: May be lower because only
+ //processable tokens are counted, but Exact also checks
+ //of non-processable!
+ foundTokens = coveredTokens;
+ } else if(foundTokens >= config.getMinFoundTokens()){
+ if(foundTokens == coveredTokens){
+ labelMatch = MATCH.FULL;
+ } else {
+ labelMatch = MATCH.PARTIAL;
+ }
+ } else {
+ labelMatch = MATCH.NONE;
+ }
+ if(labelMatch != MATCH.NONE){
+ if(match.getMatchCount() < foundTokens ||
+ match.getMatchCount() < foundTokens &&
+ labelMatch.ordinal() > match.getMatch().ordinal()){
+ match.updateMatch(labelMatch, coveredTokens, foundTokens,label,labelTokens.size());
+ } //else this match is not better as the existing one
+ } //else ignore labels with MATCH.NONE
+ } //else NO tokens found -> nothing to do
+ } // else worng language
+ }
+ return match;
+ }
+
+ /**
+ * Checks if the current token of {@link #state} is processable.
+ * @param token the {@link Token} to check.
+ * @return <code>true</code> if the parsed token needs to be processed.
+ * Otherwise <code>false</code>
+ */
+ private boolean isProcessableToken(Token token) {
+ Boolean processToken = null;
+ if(token.getPosTag() != null){
+ processToken = content.processPOS(token.getPosTag());
+ }
+ if(processToken == null) {
+ processToken = token.getText().length() >= config.getMinSearchTokenLength();
+ }
+ return processToken;
+ }
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,399 @@
+package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.chunker.Chunker;
+import opennlp.tools.postag.POSTagger;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Chunk;
+import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
+import org.apache.stanbol.enhancer.engines.keywordextraction.linking.Suggestion.MATCH;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+
+/**
+ * The configuration for the {@link EntityLinker}. Typically this
+ * configuration does not change often. Therefore it will be used for
+ * several {@link EntityLinker} instances processing different
+ * contents.
+ * @author Rupert Westenthaler
+ *
+ */
+public class EntityLinkerConfig {
+ /**
+ * The minimum length of Token to be used for searches in case no
+ * POS (Part of Speech) tags are available.
+ */
+ public static final int DEFAULT_MIN_SEARCH_TOKEN_LENGTH = 3;
+ /**
+ * The default number for the maximum number of terms suggested for a word
+ */
+ public static final int DEFAULT_SUGGESTIONS = 3;
+ /**
+ * Default value for the number of tokens that must be contained in
+ * suggested terms.
+ */
+ public static final int DEFAULT_MIN_FOUND_TOKENS = 2;
+ /**
+ * Multiple Tokens can be sent to the {@link EntitySearcher} service. The
+ * service uses this as optional parameters for the search. Therefore
+ * returned Concepts MUST contain at least a single of the parsed
+ * tokens. <p>
+ * The default value of <code>2</code> should be enough for nearly all
+ * Taxonomies to sufficiently reduce the number of results.<p>
+ * NOTE that the labels (nameField) of the results are compared as a
+ * whole. So even if only 2 Tokens are used for the search there may be
+ * more mapped to the actual label of an result.
+ */
+ public static final int DEFAULT_MAX_SEARCH_TOKENS = 2;
+
+ /**
+ * Default value for {@link #getNameField()} (rdfs:label)
+ */
+ public static final String DEFAULT_NAME_FIELD = "rdfs:label";
+ /**
+ * Default value for {@link #getTypeField()} (rdf:type)
+ */
+ public static final String DEFAULT_TYPE_FIELD = "rdf:type";
+ /**
+ * Default value for {@link #getRedirectField()} (rdf:seeAlso)
+ */
+ public static final String DEFAULT_REDIRECT_FIELD = "rdfs:seeAlso";
+ /**
+ * Default mapping for Concept types to dc:type values added for
+ * TextAnnotations.
+ */
+ public static final Map<String,UriRef> DEFAULT_ENTITY_TYPE_MAPPINGS;
+
+ static { //the default mappings for the three types used by the Stanbol Enhancement Structure
+ Map<String,UriRef> mappings = new HashMap<String,UriRef>();
+ mappings.put(OntologicalClasses.DBPEDIA_ORGANISATION.getUnicodeString(), OntologicalClasses.DBPEDIA_ORGANISATION);
+ mappings.put(NamespaceEnum.dbpediaOnt+"Newspaper", OntologicalClasses.DBPEDIA_ORGANISATION);
+ mappings.put(NamespaceEnum.schema+"Organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+
+ mappings.put(OntologicalClasses.DBPEDIA_PERSON.getUnicodeString(), OntologicalClasses.DBPEDIA_PERSON);
+ mappings.put(NamespaceEnum.foaf+"Person", OntologicalClasses.DBPEDIA_PERSON);
+ mappings.put(NamespaceEnum.schema+"Person", OntologicalClasses.DBPEDIA_PERSON);
+
+ mappings.put(OntologicalClasses.DBPEDIA_PLACE.getUnicodeString(), OntologicalClasses.DBPEDIA_PLACE);
+ mappings.put(NamespaceEnum.schema+"Place", OntologicalClasses.DBPEDIA_PLACE);
+
+ mappings.put(OntologicalClasses.SKOS_CONCEPT.getUnicodeString(), OntologicalClasses.SKOS_CONCEPT);
+ DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
+ }
+ /**
+ * Enumeration over the different possibilities on how to deal with
+ * redirects (similar to Browsers following HTTP status 303 and RDF defining
+ * the "rdf:seeAlso" relation.
+ * @author Rupert Westenthaler
+ */
+ public static enum RedirectProcessingMode {
+ /**
+ * Ignore redirects
+ */
+ IGNORE,
+ /**
+ * Follow redirects, but only add the values (e.g. labels, types) such
+ * entities to the original one.
+ */
+ ADD_VALUES,
+ /**
+ * Follow the redirect.
+ */
+ FOLLOW
+ }
+ /**
+ * The default value for how to process redirect is set to
+ * {@link RedirectProcessingMode#IGNORE}
+ */
+ public static RedirectProcessingMode DEFAULT_REDIRECT_PROCESSING_MODE =
+ RedirectProcessingMode.IGNORE;
+ /**
+ * The minimum length of labels that are looked-up in the directory
+ */
+ private int minSearchTokenLength = DEFAULT_MIN_SEARCH_TOKEN_LENGTH;
+ /**
+ * The the maximum number of terms suggested for a word
+ */
+ private int maxSuggestions = DEFAULT_SUGGESTIONS;
+ /**
+ * If several words are selected from the text to search for an Entity in the
+ * Dictionary (e.g. if a {@link Chunker} is used or if the {@link POSTagger}
+ * detects several connected nouns) that entities found for the such chunks
+ * MUST define a label (with no or the correct lanugage) that contains at
+ * least this number of tokens to be accepted.<p>
+ * TODO: make configurable
+ */
+ private int minFoundTokens = DEFAULT_MIN_FOUND_TOKENS;
+ /**
+ * The maximum numbers of Tokens sent to the {@link EntitySearcher} to search
+ * for concepts. <p>
+ * NOTE that the labels (nameField) of the results are compared as a
+ * whole. So even if only e.g. 2 tokens are used for the search there may be
+ * more mapped to the actual label of an result.
+ */
+ private int maxSearchTokens = DEFAULT_MAX_SEARCH_TOKENS;
+ /**
+ * Holds the mappings of rdf:type used by concepts to dc:type values used
+ * by TextAnnotations.
+ */
+ private Map<String,UriRef> typeMappings;
+ private Map<String, UriRef> unmodTypeMappings;
+ /**
+ * The mode on how to process redirect for Entities.
+ */
+ private RedirectProcessingMode redirectProcessingMode;
+ /**
+ * the default DC Type
+ */
+ private UriRef defaultDcType;
+ private String nameField;
+ private String redirectField;
+ private String typeField;
+ private Set<String> selectedFields = new HashSet<String>();
+ /**
+ * Default constructor the initialises the configuration with the
+ * default values
+ */
+ public EntityLinkerConfig(){
+ setMinSearchTokenLength(DEFAULT_MIN_SEARCH_TOKEN_LENGTH);
+ setMaxSuggestions(DEFAULT_SUGGESTIONS);
+ setMaxSearchTokens(DEFAULT_MAX_SEARCH_TOKENS);
+ setRedirectProcessingMode(DEFAULT_REDIRECT_PROCESSING_MODE);
+ typeMappings = new HashMap<String,UriRef>(DEFAULT_ENTITY_TYPE_MAPPINGS);
+ unmodTypeMappings = Collections.unmodifiableMap(typeMappings);
+ setDefaultDcType(typeMappings.remove(null));
+ setNameField(DEFAULT_NAME_FIELD);
+ setRedirectField(DEFAULT_REDIRECT_FIELD);
+ setTypeField(DEFAULT_TYPE_FIELD);
+ }
+ /**
+ * Getter for the uri of the field used for the names in the taxonomy
+ * (e.g. rdfs:label, skos:prefLabel). Needs to return the full URI
+ * @return the field used for the names of in the Taxonomy.
+ */
+ public final String getNameField() {
+ return nameField;
+ }
+ /**
+ * Setter for the uri of the field used for the names in the taxonomy
+ * (e.g. rdfs:label, skos:prefLabel).
+ * Converts short to full URIy by using the prefixes as registered in the
+ * {@link NamespaceEnum}.
+ * @param nameField the nameField to set
+ */
+ public final void setNameField(String nameField) {
+ this.nameField = NamespaceEnum.getFullName(nameField);
+ updateSelectedFields();
+ }
+ /**
+ * internally used to update the selected fields on changes to
+ * {@link #setNameField(String)}, {@link #setRedirectField(String)} or
+ * {@link #setTypeField(String)}
+ */
+ private void updateSelectedFields() {
+ selectedFields.clear();
+ selectedFields.add(nameField);
+ selectedFields.add(redirectField);
+ selectedFields.add(typeField);
+ }
+ /**
+ * Getter for the selected fields. A set that includes the current
+ * {@link #getNameField()}, {@link #getTypeField()} and {@link #getRedirectField()}.
+ * @return the selectedFields
+ */
+ public final Set<String> getSelectedFields() {
+ return selectedFields;
+ }
+ /**
+ * The field used to follow redirects (typically rdf:seeAlso)
+ * @return the redirect field
+ */
+ public final String getRedirectField() {
+ return redirectField;
+ }
+ /**
+ * The field used to follow redirects (typically rdf:seeAlso)
+ * Converts short to full URIy by using the prefixes as registered in the
+ * {@link NamespaceEnum}.
+ * @param redirectField the redirectField to set
+ */
+ public final void setRedirectField(String redirectField) {
+ this.redirectField = NamespaceEnum.getFullName(redirectField);
+ updateSelectedFields();
+ }
+ /**
+ * The field used to lookup the types (typically rdf:type)
+ * @return the field name used to lookup types
+ */
+ public final String getTypeField() {
+ return typeField;
+ }
+ /**
+ * The field used to lookup the types (typically rdf:type)
+ * Converts short to full URIy by using the prefixes as registered in the
+ * {@link NamespaceEnum}.
+ * @param typeField the typeField to set
+ */
+ public final void setTypeField(String typeField) {
+ this.typeField = NamespaceEnum.getFullName(typeField);
+ updateSelectedFields();
+ }
+ /**
+ * The minimum number of character a {@link Token} (word) must have to be
+ * used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
+ * in the taxonomy. Note that this parameter is only used of no POS (Part-
+ * of-speech) tags are available in the {@link AnalysedText}.
+ * @param minSearchTokenLength the minSearchTokenLength to set
+ */
+ public void setMinSearchTokenLength(int minSearchTokenLength) {
+ this.minSearchTokenLength = minSearchTokenLength;
+ }
+ /**
+ * The minimum number of character a {@link Token} (word) must have to be
+ * used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
+ * in the taxonomy. Note that this parameter is only used of no POS (Part-
+ * of-speech) tags are available in the {@link AnalysedText}.
+ * @return the minSearchTokenLength
+ */
+ public int getMinSearchTokenLength() {
+ return minSearchTokenLength;
+ }
+ /**
+ * Setter for the maximum number of suggestion returned.
+ * @param maxSuggestions the maxSuggestions to set
+ */
+ public void setMaxSuggestions(int maxSuggestions) {
+ this.maxSuggestions = maxSuggestions;
+ }
+ /**
+ * Getter for the maximum number of suggestion returned.
+ * @return the maxSuggestions
+ */
+ public int getMaxSuggestions() {
+ return maxSuggestions;
+ }
+ /**
+ * Setter for the minimum number of Tokens (of the content) that MUST match
+ * with a {@link EntitySearcher#getNameField() label} of a
+ * {@link EntitySearcher#lookup(java.util.List, String...) concept of the taxonomy}
+ * so that it is {@link Suggestion suggested} even if the match is only
+ * {@link MATCH#PARTIAL}. Entities that match less than that are only included
+ * if a label is an {@link MATCH#EXACT EXACT} match with the current position
+ * in the text.
+ * @param minFoundTokens the minFoundTokens to set
+ */
+ public void setMinFoundTokens(int minFoundTokens) {
+ this.minFoundTokens = minFoundTokens;
+ }
+ /**
+ * Getter for the minimum number of Tokens (of the content) that MUST match
+ * with a {@link EntitySearcher#getNameField() label} of a
+ * {@link EntitySearcher#lookup(java.util.List, String...) concept of the taxonomy}
+ * so that it is {@link Suggestion suggested} even if the match is only
+ * {@link MATCH#PARTIAL}. Entities that match less than that are only included
+ * if a label is an {@link MATCH#EXACT EXACT} match with the current position
+ * in the text.
+ * @return the minFoundTokens
+ */
+ public int getMinFoundTokens() {
+ return minFoundTokens;
+ }
+ /**
+ * Getter for the maximum number of tokens parsed to
+ * {@link EntitySearcher#lookup(java.util.List, String...)}
+ * @return the maxSearchTokens
+ */
+ public final int getMaxSearchTokens() {
+ return maxSearchTokens;
+ }
+ /**
+ * The maximum number of tokens parsed to
+ * {@link EntitySearcher#lookup(java.util.List, String...)}. This is NOT the
+ * maximum number of Tokens mapped for Entities returned by such queries.<p>
+ * In case {@link Chunk}s are available in the parsed {@link AnalysedText}
+ * searches can be scoped by such chunks. However if no chunks are available,
+ * than this value is used to collect this number of words in the text.<p>
+ * The {@link #DEFAULT_MAX_SEARCH_TOKENS default value} of <code>2</code>
+ * should be ok in most cases.
+ * @param maxSearchTokens the maxSearchTokens to set
+ */
+ public final void setMaxSearchTokens(int maxSearchTokens) {
+ this.maxSearchTokens = maxSearchTokens;
+ }
+ /**
+ * Removes the mapping for the parsed concept type
+ * @param conceptType the concept type to remove the mapping
+ * @return the previously mapped dc:type value or <code>null</code> if
+ * no mapping for the parsed concept type was present
+ */
+ public UriRef removeTypeMapping(String conceptType){
+ return typeMappings.remove(conceptType);
+ }
+ /**
+ *
+ * @param conceptType the type of the concept or <code>null</code> to
+ * add the default dc:type mapping. See also {@link #setDefaultDcType(UriRef)}
+ * @param dcType the dc:type for the parsed concept type
+ * @return the previously mapped dc:type value if an existing mapping
+ * was updated or <code>null</code> if a new mapping was added.
+ */
+ public UriRef setTypeMapping(String conceptType, UriRef dcType){
+ if(dcType == null) {
+ throw new IllegalArgumentException("The parsed dc:type URI MUST NOT be NULL!");
+ }
+ if(conceptType == null){ //handle setting of the default dc:type value
+ UriRef oldDefault = getDefaultDcType();
+ setDefaultDcType(dcType);
+ return oldDefault;
+ }
+ return typeMappings.put(conceptType, dcType);
+ }
+
+ /**
+ * Setter for the default dc:type of linked entities if for none of the
+ * types of the suggestions a {@link #getTypeMappings()} exists. Set this
+ * to <code>null</code> to specify that no dc:type should be set in such
+ * cases.
+ * @param defaultDcType the defaultDcType to set
+ */
+ public void setDefaultDcType(UriRef defaultDcType) {
+ this.defaultDcType = defaultDcType;
+ }
+ /**
+ * The default type for Entities if no {@link #getTypeMappings() type mapping}
+ * is present. <code>null</code> means that no type should be set if no
+ * explicit mapping exists
+ * @return the defaultDcType
+ */
+ public UriRef getDefaultDcType() {
+ return defaultDcType;
+ }
+ /**
+ * Setter for the mode on how to deal with redirects
+ * @param redirectProcessingMode the redirectProcessingMode to set
+ */
+ public void setRedirectProcessingMode(RedirectProcessingMode redirectProcessingMode) {
+ this.redirectProcessingMode = redirectProcessingMode;
+ }
+ /**
+ * Getter for the mode how to deal with redirects
+ * @return the redirectProcessingMode
+ */
+ public RedirectProcessingMode getRedirectProcessingMode() {
+ return redirectProcessingMode;
+ }
+ /**
+ * Getter for the read only mappings of type mappings
+ * @return the type mappings (read only)
+ */
+ public Map<String,UriRef> getTypeMappings() {
+ return unmodTypeMappings;
+ }
+}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinkerConfig.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java?rev=1173968&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java Thu Sep 22 06:51:30 2011
@@ -0,0 +1,51 @@
+package org.apache.stanbol.enhancer.engines.keywordextraction.linking;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.stanbol.entityhub.servicesapi.Entityhub;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.query.TextConstraint;
+import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
+
/**
 * Interface used to search for Entities (e.g. as defined by a Controlled
 * Vocabulary). Different implementations of this interface allow to use
 * different sources. Typically the {@link Entityhub} or a {@link ReferencedSite}
 * will be used as source, but in some cases one might also use an in-memory
 * implementation.
 * @author Rupert Westenthaler
 */
public interface EntitySearcher {
    /**
     * Lookup Concepts for the parsed strings. Parameters follow the same
     * rules as {@link TextConstraint#TextConstraint(List, String...)}
     * @param field the field used to search for values in the parsed languages
     * @param includeFields A set of fields that need to be included within the
     * returned {@link Representation}. The parsed field needs also to be included
     * even if missing in this set. If <code>null</code> only the field needs
     * to be included. Other fields MAY also be included.
     * @param search the tokens to search for. MUST NOT be <code>null</code>
     * @param languages the languages to include in the search
     * @return the Representations found for the specified query
     * @throws IllegalStateException if the lookup cannot be performed (e.g.
     * because the searcher only {@link #supportsOfflineMode() supports offline
     * mode} while the underlying service requires remote access)
     */
    Collection<? extends Representation> lookup(String field,Set<String> includeFields,List<String> search,String...languages) throws IllegalStateException;
    /**
     * Lookup a concept of the taxonomy by the id.
     * @param id the id
     * @param includeFields A set of fields that need to be included within the
     * returned {@link Representation}. Other fields MAY be also included.
     * @return the concept or <code>null</code> if not found
     * @throws IllegalStateException if the entity cannot be dereferenced
     * (e.g. because remote access is required but not available)
     */
    Representation get(String id,Set<String> includeFields) throws IllegalStateException;
    /**
     * Returns <code>true</code> if this EntitySearcher can operate without
     * dependencies to remote services. This is important because Stanbol can
     * be forced to run in offline-mode.
     * @return the state
     */
    boolean supportsOfflineMode();
}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
------------------------------------------------------------------------------
svn:mime-type = text/plain