You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/24 10:40:11 UTC
svn commit: r1413155 [3/4] - in /stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking: ./ src/ src/license/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/stanbol/ src/main/java/org/apac...

Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,693 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.RedirectProcessingMode;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState.TokenData;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class EntityLinker {
+    
+    /**
+     * Logger used for the debug and warn output of the linking process
+     */
+    private final Logger log = LoggerFactory.getLogger(EntityLinker.class);
+    
+    /**
+     * The linking configuration (e.g. max search tokens, max suggestions,
+     * name/type/redirect fields, redirect processing mode)
+     */
+    private final EntityLinkerConfig linkerConfig;
+    /**
+     * The text processing configuration. Used to check if chunks are ignored
+     * and parsed to the {@link ProcessingState}
+     */
+    private final LanguageProcessingConfig textProcessingConfig;
+    //private final AnalysedText analysedText;
+    /**
+     * Used to lookup entities for search strings and to retrieve the
+     * entities referenced by redirects
+     */
+    private final EntitySearcher entitySearcher;
+    /**
+     * The state of the current processing
+     */
+    private final ProcessingState state;
+    /**
+     * The map holding the results of the linking process. The selected text
+     * is used as key.
+     */
+    private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
+    
+    /**
+     * The limit parsed to the {@link EntitySearcher} when looking up entities.
+     * Initialised by the constructor to the maximum of <code>10</code> and two
+     * times the configured max suggestions.
+     */
+    private Integer lookupLimit;
+    
+    /**
+     * Used to tokenize the labels of entities
+     */
+    private LabelTokenizer labelTokenizer;
+    
+
+    public EntityLinker(AnalysedText analysedText, String language,
+                        LanguageProcessingConfig textProcessingConfig,
+                        EntitySearcher entitySearcher,
+                        EntityLinkerConfig linkerConfig,
+                        LabelTokenizer labelTokenizer) {
+        //this.analysedText = analysedText;
+        this.entitySearcher = entitySearcher;
+        this.linkerConfig = linkerConfig;
+        this.textProcessingConfig = textProcessingConfig;
+        this.labelTokenizer = labelTokenizer;
+        this.state = new ProcessingState(analysedText,language,textProcessingConfig,linkerConfig);
+        this.lookupLimit  = Math.max(10,linkerConfig.getMaxSuggestions()*2);
+    }
+    /**
+     * Steps over the tokens provided by the {@link ProcessingState} and tries
+     * to link entities for each of them. The results are available via
+     * {@link #getLinkedEntities()}.<p>
+     * For every token this
+     * <ol>
+     * <li> collects the search strings: the text of the current token plus the
+     * texts of matchable tokens around it (restricted to the not yet consumed
+     * tokens and - if chunks are not ignored - to the chunk of the token)
+     * <li> looks up entities for those search strings
+     * <li> marks suggestions matching fewer tokens than the best one as
+     * {@link MATCH#PARTIAL}, filters suggestions below the configured minimum
+     * of found tokens, calculates the scores and re-sorts by score
+     * <li> processes redirects (if not disabled by the configuration)
+     * <li> registers the occurrence with the {@link LinkedEntity} for the
+     * selected text and marks the covered tokens as consumed
+     * </ol>
+     * @throws EngineException if the lookup of entities fails
+     */
+    public void process() throws EngineException {
+        while(state.next()) {
+            TokenData token = state.getToken();
+            if(log.isDebugEnabled()){
+                log.debug("--- preocess Token {}: {} (lemma: {} | pos:{}) chunk: {}",
+                    new Object[]{token.index,token.token.getSpan(),
+                                 token.morpho != null ? token.morpho.getLemma() : "none",
+                                 token.token.getAnnotations(POS_ANNOTATION),
+                                 token.inChunk != null ?
+                                         (token.inChunk.chunk + " "+ token.inChunk.chunk.getSpan()) :
+                                             "none"});
+            }
+            List<String> searchStrings = new ArrayList<String>(linkerConfig.getMaxSearchTokens());
+            searchStrings.add(token.getTokenText());
+            //Determine the range we are allowed to search for tokens
+            final int minIncludeIndex;
+            final int maxIncludeIndex;
+            if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks()){
+                //stay within the chunk, but never include consumed tokens
+                minIncludeIndex = Math.max(
+                    state.getConsumedIndex()+1,
+                    token.inChunk.startToken);
+                maxIncludeIndex = token.inChunk.endToken;
+            } else {
+                maxIncludeIndex = state.getTokens().size() - 1;
+                minIncludeIndex = state.getConsumedIndex() + 1;
+            }
+            int prevIndex,pastIndex; //search away from the currently active token
+            int distance = 0;
+            do {
+                distance++;
+                prevIndex = token.index-distance;
+                pastIndex = token.index+distance;
+                if(minIncludeIndex <= prevIndex){
+                    TokenData prevToken = state.getTokens().get(prevIndex);
+                    if(log.isDebugEnabled()){
+                        log.debug("    {} {}:'{}' (lemma: {} | pos:{})",new Object[]{
+                            prevToken.isMatchable? '+':'-',prevToken.index,
+                            prevToken.token.getSpan(),
+                            prevToken.morpho != null ? prevToken.morpho.getLemma() : "none",
+                            prevToken.token.getAnnotations(POS_ANNOTATION)
+                        });
+                    }
+                    if(prevToken.isMatchable){
+                        //previous tokens are prepended to keep the text order
+                        searchStrings.add(0,prevToken.getTokenText());
+                    }
+                }
+                if(maxIncludeIndex >= pastIndex){
+                    TokenData pastToken = state.getTokens().get(pastIndex);
+                    if(log.isDebugEnabled()){
+                        log.debug("    {} {}:'{}' (lemma: {} | pos:{})",new Object[]{
+                            pastToken.isMatchable? '+':'-',pastToken.index,
+                            pastToken.token.getSpan(),
+                            pastToken.morpho != null ? pastToken.morpho.getLemma() : "none",
+                            pastToken.token.getAnnotations(POS_ANNOTATION)
+                        });
+                    }
+                    if(pastToken.isMatchable){
+                        searchStrings.add(pastToken.getTokenText());
+                    }
+                }
+            } while(searchStrings.size() < linkerConfig.getMaxSearchTokens() && distance <
+                    linkerConfig.getMaxSearchDistance() &&
+                    (prevIndex > minIncludeIndex || pastIndex < maxIncludeIndex));
+            //a single iteration may add two tokens, so we might have an
+            //additional element in the list
+            if(searchStrings.size() > linkerConfig.getMaxSearchTokens()){
+                searchStrings = searchStrings.subList(0, linkerConfig.getMaxSearchTokens());
+            }
+            log.debug("  >> searchStrings {}",searchStrings);
+            //search for Entities
+            List<Suggestion> suggestions = lookupEntities(searchStrings);
+            if(!suggestions.isEmpty()){
+                //update the suggestions based on the best match
+                int bestMatchCount = suggestions.get(0).getLabelMatch().getMatchCount();
+                Iterator<Suggestion> it = suggestions.iterator();
+                while(it.hasNext()){
+                    Suggestion suggestion = it.next();
+                    //suggestions that match less tokens as the best match
+                    //need to be updated to PARTIAL
+                    int matchCount = suggestion.getLabelMatch().getMatchCount();
+                    if(matchCount < bestMatchCount){
+                        suggestion.setMatch(MATCH.PARTIAL);
+                    }
+                    //Filter matches with less than config.getMinFoundTokens()
+                    //if matchcount is less than of the best match
+                    if(matchCount < bestMatchCount &&
+                            matchCount < linkerConfig.getMinFoundTokens()){
+                        it.remove();
+                    } else { //calculate the score
+                        //how good is the current match in relation to the best one
+                        //NOTE: this needs to be a floating point division. An
+                        //integer division truncates the ratio of every
+                        //suggestion matching fewer tokens than the best one
+                        //to 0 (and therefore also its score).
+                        double spanScore = (double)matchCount/bestMatchCount;
+                        suggestion.setScore(spanScore*spanScore*suggestion.getLabelMatch().getMatchScore());
+                    }
+                }
+                Suggestion oldBestRanked = suggestions.get(0); //for debugging
+                //resort by score
+                Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+                //this should never happen ... but the
+                //matchcount of the best match MUST NOT change
+                //after the sort by score!
+                if(bestMatchCount != suggestions.get(0).getLabelMatch().getMatchCount()){
+                    log.warn("The match count for the top Ranked Suggestion for {} " +
+                            "changed after resorting based on Scores!",
+                        state.getTokenText(suggestions.get(0).getLabelMatch().getStart(),bestMatchCount));
+                    log.warn("  originalbest   : {}",oldBestRanked);
+                    log.warn(" currnet ranking : {}",suggestions);
+                    log.warn("  ... this will result in worng confidence values relative to the best match");
+                }
+                //remove all suggestions > config.maxSuggestions
+                if(suggestions.size() > linkerConfig.getMaxSuggestions()){
+                    suggestions.subList(linkerConfig.getMaxSuggestions(),suggestions.size()).clear();
+                }
+                if(log.isDebugEnabled()){
+                    log.debug("  >> Suggestions:");
+                    int i=0;
+                    for(Suggestion s : suggestions){
+                        log.debug("   - {}: {}",i,s);
+                        i++;
+                    }
+                }
+                //process redirects
+                if(linkerConfig.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){
+                    for(Suggestion suggestion : suggestions){
+                        processRedirects(suggestion);
+                    }
+                }
+                //the top ranked suggestion defines the selected text
+                int start = suggestions.get(0).getLabelMatch().getStart();
+                int span = suggestions.get(0).getLabelMatch().getSpan();
+                //Store the linking results
+                String selectedText = state.getTokenText(start,span);
+                LinkedEntity linkedEntity = linkedEntities.get(selectedText);
+                if(linkedEntity == null){
+                    //first occurrence of this text: the types are determined
+                    //based on the top ranked suggestion only
+                    linkedEntity = new LinkedEntity(selectedText,
+                        suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
+                    linkedEntities.put(selectedText, linkedEntity);
+                }
+                linkedEntity.addOccurrence(state.getSentence(),
+                    //NOTE: The end Token is "start+span-1"
+                    state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
+                //set the next token to process to the next word after the
+                //currently found suggestion
+                state.setConsumed(start+span-1);
+            }
+
+        }
+    }
+    /**
+     * After {@link #process()}ing this returns the entities linked for the
+     * parsed {@link AnalysedText}. The selected text is used as key.<p>
+     * NOTE: this returns the map used internally to store the results and
+     * not a copy.
+     * @return the linked entities
+     */
+    public final Map<String,LinkedEntity> getLinkedEntities() {
+        return linkedEntities;
+    }
+    /**
+     * Retrieves all {@link EntitySearcher#getTypeField()} values of the parsed
+     * {@link Suggestion}s and than lookup the {@link NamespaceEnum#dcTerms dc}:type
+     * values for the {@link LinkedEntity#getTypes()} by using the configured
+     * {@link EntityLinkerConfig#getTypeMappings() types mappings} (and if
+     * no mapping is found the {@link EntityLinkerConfig#getDefaultDcType() 
+     * default} type.
+     * @param conceptTypes The list of suggestions
+     * @return the types values for the {@link LinkedEntity}
+     */
+    private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){
+        Collection<String> conceptTypes = new HashSet<String>();
+        for(Suggestion suggestion : suggestions){
+            for(Iterator<Reference> types = 
+                suggestion.getRepresentation().getReferences(linkerConfig.getTypeField()); 
+                types.hasNext();conceptTypes.add(types.next().getReference()));
+        }
+        Map<String,UriRef> typeMappings = linkerConfig.getTypeMappings();
+        Set<UriRef> dcTypes = new HashSet<UriRef>();
+        for(String conceptType : conceptTypes){
+            UriRef dcType = typeMappings.get(conceptType);
+            if(dcType != null){
+                dcTypes.add(dcType);
+            }
+        }
+        if(dcTypes.isEmpty() && linkerConfig.getDefaultDcType() != null){
+            dcTypes.add(linkerConfig.getDefaultDcType());
+        }
+        return dcTypes;
+    }
+    /**
+     * Processes {@link EntitySearcher#getRedirectField() redirect field} values for
+     * the parsed suggestions based on the {@link RedirectProcessingMode}
+     * as configured in the {@link #config}.<p>
+     * The results of this method are stored within the parsed {@link Suggestion}s
+     * @param suggestion The suggestion to process.
+     */
+    private void processRedirects(Suggestion suggestion) {
+        //if mode is IGNORE -> nothing to do
+        if(linkerConfig.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){
+            return;
+        }
+        //in case results for queries are locally cached it might be the case
+        //that some/all of the results do already redirects processed.
+        //therefore there is a small internal state that stores this information
+        if(suggestion.isRedirectedProcessed()){
+            return; //Redirects for ResultMatch are already processed ... ignore
+        }
+        Representation result = suggestion.getResult();
+        Iterator<Reference> redirects = result.getReferences(linkerConfig.getRedirectField());
+        switch (linkerConfig.getRedirectProcessingMode()) {
+            case ADD_VALUES:
+                while(redirects.hasNext()){
+                    Reference redirect = redirects.next();
+                    if(redirect != null){
+                        Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
+                            linkerConfig.getSelectedFields());
+                        if(redirectedEntity != null){
+                            for(Iterator<String> fields = redirectedEntity.getFieldNames();fields.hasNext();){
+                                String field = fields.next();
+                                result.add(field, redirectedEntity.get(field));
+                            }
+                        }
+                        //set that the redirects where searched for this result
+                        suggestion.setRedirectProcessed(true);
+                    }
+                }
+            case FOLLOW:
+                while(redirects.hasNext()){
+                    Reference redirect = redirects.next();
+                    if(redirect != null){
+                        Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
+                            linkerConfig.getSelectedFields());
+                        if(redirectedEntity != null){
+                            //copy the original result score
+                            redirectedEntity.set(RdfResourceEnum.resultScore.getUri(),
+                                result.get(RdfResourceEnum.resultScore.getUri()));
+                            //set the redirect
+                            suggestion.setRedirect(redirectedEntity);
+                        }
+                    }
+                }
+            default: //nothing to do
+        }
+    }
+    /**
+     * Searches for Entities in the {@link #entitySearcher} corresponding to the
+     * {@link Token#getText() words} of the current {@link #state position} in
+     * the text.
+     * @param searchStrings the list of {@link Token#getText() words} to search
+     * entities for.
+     * @return The sorted list with the suggestions.
+     * If there are no suggestions an empty list will be returned.
+     */
+    private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException {
+        Collection<? extends Representation> results;
+        try {
+            results = entitySearcher.lookup(linkerConfig.getNameField(),
+                linkerConfig.getSelectedFields(),
+                searchStrings, 
+                new String[]{state.getLanguage(),linkerConfig.getDefaultLanguage()},
+                lookupLimit);
+        } catch (RuntimeException e) {
+            throw new EngineException(e.getMessage(),e);
+        }
+        log.debug("   - found {} entities ...",results.size());
+        List<Suggestion> suggestions = new ArrayList<Suggestion>();
+        for(Representation result : results){ 
+            log.debug("    > {}",result.getId());
+            Suggestion suggestion = matchLabels(result);
+            log.debug("      < {}",suggestion);
+            if(suggestion.getMatch() != MATCH.NONE){
+                suggestions.add(suggestion);
+            }                    
+        }
+        //sort the suggestions
+        if(suggestions.size()>1){
+            Collections.sort(suggestions,Suggestion.MATCH_TYPE_SUGGESTION_COMPARATOR);
+        }
+        //TODO: Work in Progress feature ... allowing to refine search if no
+        //      suggestion is found but results where present
+        //      However this would need full limit/offset support for the
+        //      EntitySearcher. (rwesten 2012-05-21)
+//        Integer maxResults = entitySearcher.getLimit();
+//        if(maxResults == null){
+//            maxResults = 1; //fall back to 1 if limit is not known
+//        }
+//        if(suggestions.isEmpty() && //if no suggestions where found
+//                results.size() >= maxResults && //but the query had max results
+//                //than the actual entity might not be within the first LIMIT results
+//                searchStrings.size() > 1){ //if multiple words where used for the search
+//            //try again with only a single word
+//            suggestions = lookupEntities(Collections.singletonList(searchStrings.get(0)));
+//            
+//        }
+        //remove all elements > config.getMaxSuggestions()
+        return suggestions;
+    }
+    /**
+     * Matches the labels of the parsed {@link Representation} with the Tokens of
+     * the texts (beginning with the currently active 
+     * {@link ProcessingState#getToken() token}).<p>
+     * The field used to get the labels is retrieved from 
+     * {@link EntitySearcher#getNameField()}. Only labels with no language or the
+     * language of the current sentence are considered. If less than 
+     * {@link EntityLinkerConfig#getMinFoundTokens()} tokens match with an
+     * label the Concept is only considered to match if the label is
+     * {@link String#equalsIgnoreCase(String)} to the text covered by the
+     * matched token(s). Otherwise also {@link MATCH#FULL} and {@link MATCH#PARTIAL}
+     * results are allowed.
+     * @param rep The representation including at least the data for the
+     * {@link EntitySearcher#getNameField()} property.
+     * @return The result of the matching.
+     */
+    private Suggestion matchLabels(Representation rep) {
+        String curLang = state.getLanguage(); //language of the current sentence
+        String defLang = linkerConfig.getDefaultLanguage(); //configured default language 
+//        Iterator<Text> labels = rep.get(config.getNameField(), //get all labels
+//            state.getLanguage(), //in the current language
+//            config.getDefaultLanguage()); //and the default language
+        Iterator<Text> labels = rep.getText(linkerConfig.getNameField());
+        Suggestion match = new Suggestion(rep);
+        Collection<Text> defaultLabels = new ArrayList<Text>();
+        boolean matchedCurLangLabel = false;
+        while(labels.hasNext()){
+            Text label = labels.next();
+            String lang = label.getLanguage();
+            if((lang == null && curLang == null) ||
+                    (lang != null && curLang != null && lang.startsWith(curLang))){
+                matchLabel(match, label);
+                matchedCurLangLabel = true;
+            } else if((lang ==null && defLang == null) ||
+                    (lang != null && defLang != null && lang.startsWith(defLang))){
+                defaultLabels.add(label);
+            }
+        }
+        //use only labels in the default language if there is
+        // * no label in the current language or
+        // * no MATCH was found in the current language
+        if(!matchedCurLangLabel || match.getMatch() == MATCH.NONE){
+            for(Text defaultLangLabel : defaultLabels){
+                matchLabel(match, defaultLangLabel);
+            }
+        }
+        return match;
+    }
+    
+    /**
+     * @param suggestion
+     * @param label
+     */
+    private void matchLabel(Suggestion suggestion, Text label) {
+        String text = label.getText();
+        if(!linkerConfig.isCaseSensitiveMatching()){
+            text = text.toLowerCase(); //TODO use language of label for Locale
+        }
+        //Tokenize the label and remove remove tokens without alpha numerical chars
+        String[] unprocessedLabelTokens = labelTokenizer.tokenize(text,
+            state.getLanguage()); //TODO: maybe check of Pos.Foreign
+        if(unprocessedLabelTokens == null){ //no tokenizer available
+            log.info("Unable to tokenize {} language texts. Will process untokenized label {}",
+                state.getLanguage(),text);
+            unprocessedLabelTokens = new String[]{text}; //there is already a warning
+        }
+        int offset = 0;
+        for(int i=0;i<unprocessedLabelTokens.length;i++){
+            boolean hasAlphaNumericChar = Utils.hasAlphaNumericChar(unprocessedLabelTokens[i]);
+            if(!hasAlphaNumericChar){
+                offset++;
+            } else if(offset > 0){
+                unprocessedLabelTokens[i-offset] = unprocessedLabelTokens[i];
+            }
+        }
+        String[] labelTokens;
+        if(offset == 0){
+            labelTokens = unprocessedLabelTokens;
+        } else {
+            labelTokens = new String[unprocessedLabelTokens.length-offset];
+            System.arraycopy(unprocessedLabelTokens, 0, labelTokens, 0, labelTokens.length);
+        }
+        Set<String> labelTokenSet = new HashSet<String>(
+                Arrays.asList(labelTokens));
+        int foundProcessableTokens = 0;
+        int foundTokens = 0;
+        float foundTokenMatch = 0;
+        //ensure the correct order of the tokens in the suggested entity
+        boolean search = true;
+        int firstFoundIndex = -1;
+        int firstProcessableFoundIndex = -1;
+        int lastFoundIndex = -1;
+        int lastProcessableFoundIndex = -1;
+        int firstFoundLabelIndex = -1;
+        int lastfoundLabelIndex = -1;
+        TokenData currentToken;
+        String currentTokenText;
+        int currentTokenLength;
+        int notFound = 0;
+        int matchedTokensNotWithinProcessableTokenSpan = 0;
+        int foundTokensWithinCoveredProcessableTokens = 0;
+        float minTokenMatchFactor = linkerConfig.getMinTokenMatchFactor();
+        //search for matches within the correct order
+        for(int currentIndex = state.getToken().index;
+                currentIndex < state.getTokens().size() 
+                && search ;currentIndex++){
+            currentToken = state.getTokens().get(currentIndex);
+            if(currentToken.hasAlphaNumeric){
+                currentTokenText = currentToken.getTokenText();
+                if(!linkerConfig.isCaseSensitiveMatching()){
+                    currentTokenText = currentTokenText.toLowerCase();
+                }
+                currentTokenLength = currentTokenText.length();
+                boolean found = false;
+                float matchFactor = 0f;
+                //iteration starts at the next token after the last matched one
+                //so it is OK to skip tokens in the label, but not within the text
+                for(int i = lastfoundLabelIndex+1;!found && i < labelTokens.length;i ++){
+                    String labelTokenText = labelTokens[i];
+                    int labelTokenLength = labelTokenText.length();
+                    float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
+                    float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
+                    if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison 
+                        int matchCount = compareTokens(currentTokenText, labelTokenText);
+                        if(matchCount/maxLength >= minTokenMatchFactor){
+                            lastfoundLabelIndex = i; //set the last found index to the current position
+                            found = true; //set found to true -> stops iteration
+                            matchFactor = matchCount/maxLength; //how good is the match
+                            //remove matched labels from the set to disable them for
+                            //a later random oder search
+                            labelTokenSet.remove(labelTokenText);
+                        }
+                    }
+                }
+                if(!found){
+                    //search for a match in the wrong order
+                    //currently only exact matches (for testing)
+                    if(found = labelTokenSet.remove(currentTokenText)){
+                        matchFactor = 0.7f;
+                    }
+                }
+                //int found = text.indexOf(currentToken.getText().toLowerCase());
+                if(found){ //found
+                    if(currentToken.isMatchable){
+                        foundProcessableTokens++; //only count processable Tokens
+                        if(firstProcessableFoundIndex < 0){
+                            firstProcessableFoundIndex = currentIndex;
+                        }
+                        lastProcessableFoundIndex = currentIndex;
+                        foundTokensWithinCoveredProcessableTokens++;
+                        if(matchedTokensNotWithinProcessableTokenSpan > 0){
+                            foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
+                                    matchedTokensNotWithinProcessableTokenSpan;
+                            matchedTokensNotWithinProcessableTokenSpan = 0;
+                        }
+                    } else {
+                        matchedTokensNotWithinProcessableTokenSpan++;
+                    }
+                    foundTokens++;
+                    foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
+                    if(firstFoundIndex < 0){
+                        firstFoundIndex = currentIndex;
+                        firstFoundLabelIndex = lastfoundLabelIndex;
+                    }
+                    lastFoundIndex = currentIndex;
+                } else { //not found
+                    notFound++;
+                    if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
+                        //stop as soon as a token that needs to be processed is
+                        //not found in the label or the maximum number of tokens
+                        //that are not processable are not found
+                        search = false; 
+                    }
+                }
+            } // else token without alpha or numeric characters are not processed
+        }
+        //search backwards for label tokens until firstFoundLabelIndex if there
+        //are unconsumed Tokens in the sentence before state.getTokenIndex
+        int currentIndex = state.getToken().index-1;
+        int labelIndex = firstFoundLabelIndex-1;
+        notFound = 0;
+        matchedTokensNotWithinProcessableTokenSpan = 0;
+        search = true;
+        while(search && labelIndex >= 0 && currentIndex > state.getConsumedIndex()){
+            String labelTokenText = labelTokens[labelIndex];
+            if(labelTokenSet.contains(labelTokenText)){ //still not matched
+                currentToken = state.getTokens().get(currentIndex);
+                currentTokenText = currentToken.getTokenText();
+                if(!linkerConfig.isCaseSensitiveMatching()){
+                    currentTokenText = currentTokenText.toLowerCase();
+                }
+                currentTokenLength = currentTokenText.length();
+                boolean found = false;
+                float matchFactor = 0f;
+                int labelTokenLength = labelTokenText.length();
+                float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
+                float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
+                if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison 
+                    int matchCount = compareTokens(currentTokenText, labelTokenText);
+                    if(matchCount/maxLength >= minTokenMatchFactor){
+                        found = true; //set found to true -> stops iteration
+                        matchFactor = matchCount/maxLength; //how good is the match
+                    }
+                }
+                if(found){ //found
+                    if(currentToken.isMatchable){
+                        foundProcessableTokens++; //only count processable Tokens
+                        firstProcessableFoundIndex = currentIndex;
+                        foundTokensWithinCoveredProcessableTokens++;
+                        if(matchedTokensNotWithinProcessableTokenSpan > 0){
+                            foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
+                                    matchedTokensNotWithinProcessableTokenSpan;
+                            matchedTokensNotWithinProcessableTokenSpan = 0;
+                        }
+                    } else {
+                        matchedTokensNotWithinProcessableTokenSpan++;
+                    }
+                    foundTokens++;
+                    foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
+                    firstFoundIndex = currentIndex;
+                    labelIndex--; 
+                    labelTokenSet.remove(labelTokenText);
+                } else {
+                    notFound++;
+                    if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
+                        //stop as soon as a token that needs to be processed is
+                        //not found in the label or the maximum number of tokens
+                        //that are not processable are not found
+                        search = false; 
+                    }
+                }
+                currentIndex --;
+            } else { //this token is already matched ...
+                labelIndex--; //try the next one
+            }
+        }
+        if(foundProcessableTokens > 0) { //if any Token has matched
+            //Now we make a second round to search tokens that match in the wrong order
+            //e.g. if given and family name of persons are switched
+            final LabelMatch labelMatch;
+            int coveredTokens = lastFoundIndex-firstFoundIndex+1;
+            int coveredProcessableTokens = lastProcessableFoundIndex-firstProcessableFoundIndex+1;
+            //matched tokens only within the span of the first/last processable token
+            //Matching rules
+            // - if less than config#minTokenFound() than accept only EXACT
+            // - override PARTIAL matches with FULL/EXACT matches only if
+            //   foundTokens of the PARTIAL match is > than of the FULL/EXACT
+            //   match (this will be very rare
+            String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
+            if(linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){ 
+                labelMatch = new LabelMatch(firstFoundIndex, coveredTokens, label);
+            } else {
+                if(foundTokens == labelTokens.length && foundTokens == coveredTokens){
+                    //if all token matched set found to covered: May be lower because only
+                    //processable tokens are counted, but FULL also checks
+                    //of non-processable!
+                    foundTokens = coveredTokens;
+                    foundProcessableTokens = coveredProcessableTokens;
+                }
+                labelMatch = new LabelMatch(firstProcessableFoundIndex, coveredProcessableTokens, 
+                    foundProcessableTokens,foundTokensWithinCoveredProcessableTokens,
+                    foundTokenMatch/foundTokens,label,labelTokens.length);
+            }
+            if(labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() && 
+                    labelMatch.getTextScore() >= linkerConfig.getMinTextScore() && 
+                    labelMatch.getMatchScore() >= linkerConfig.getMinMatchScore()){
+                suggestion.addLabelMatch(labelMatch);
+            }
+        } //else NO tokens found -> nothing to do
+    }
+    /**
+     * Compares to token with each other and returns the longest match. The 
+     * tokens are compared from the beginning and from the end.
+     * @param token1 the first token
+     * @param token2 the second token
+     * @return the number of matching chars
+     */
+    private int compareTokens(String token1,String token2){
+        int l1 = token1.length(); //length of the first token
+        int l2 = token2.length(); //length of the second token
+        //in case of same length check for equals first
+        if(l1 == l2 && token1.equals(token2)){ 
+            return l1;
+        }
+        int ml = l1>l2?l2:l1; //minimum length of a token
+        if(ml == 0){
+            return ml;
+        }
+        int f = 0; //forward match count + 1
+        int b = 0; //backward match count + 1
+        boolean match = true; //still matches
+        while(match && f < ml){
+            match = token1.charAt(f) == token2.charAt(f);
+            f++;
+        }
+        if(!match){
+            f--;
+        }
+        if(f < ml){
+            match = true;
+            while(match && b < ml){
+                b++;
+                match = token1.charAt(l1-b) == token2.charAt(l2-b);
+            }
+            if(!match){
+                b--;
+            }
+        }
+        return f > b ? f : b;
+    }
+
+}

Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,197 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import java.util.Comparator;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+
+public class LabelMatch {
+    /**
+     * Shared instance to be used whenever no match is present
+     */
+    public static final LabelMatch NONE = new LabelMatch();
+
+    /** The type of the match; stays {@link MATCH#NONE} for {@link #NONE} */
+    private MATCH match = MATCH.NONE;
+    /** The start token index of the match */
+    private int start;
+    /** The number of tokens covered by the match */
+    private int span;
+    /** The number of processable tokens that did match */
+    private int processableMatchCount;
+    /** The label that was matched */
+    private Text label;
+    /** The number of tokens of the {@link #label} */
+    private int labelTokenCount;
+    /** The overall score: {@link #textScore} multiplied with {@link #labelScore} */
+    private double score;
+    /**
+     * The score of the token matches. A value lower than <code>1.0</code>
+     * describes matches that are not perfect (presumably because of stemming
+     * or some other kind of fuzziness). Values greater than <code>1.0</code>
+     * are rejected by the constructor.
+     */
+    private float tokenMatchScore;
+    /** Matched tokens (weighted by {@link #tokenMatchScore}) relative to the {@link #span} */
+    private double textScore;
+    /** Matched tokens (weighted by {@link #tokenMatchScore}) relative to the {@link #labelTokenCount} */
+    private double labelScore;
+
+    private LabelMatch(){
+        //only used for the NONE instance: all fields keep their defaults
+    }
+
+    /**
+     * Creates a label match where all tokens of the span did match with a
+     * token match score of <code>1</code> and where the label has exactly
+     * as many tokens as the span.<p>
+     * NOTE(review): this constructor is intended for {@link MATCH#EXACT}
+     * matches, but the delegate constructor only assigns NONE, FULL or
+     * PARTIAL - confirm that FULL is the expected result.
+     * @param start the start token index
+     * @param span the number of covered tokens
+     * @param label the matched label
+     */
+    protected LabelMatch(int start, int span, Text label){
+        this(start,span,span,span,1f,label,span);
+    }
+
+    /**
+     * Creates a label match and calculates its scores.
+     * @param start the start token index. MUST BE &gt;= 0
+     * @param span the number of covered tokens. MUST BE &gt; 0 and not
+     * lower than both match counts
+     * @param processableMatchCount the number of matched processable tokens.
+     * MUST NOT be greater than <code>matchCount</code>
+     * @param matchCount the number of matched tokens
+     * @param tokenMatchScore how well the tokens did match. MUST BE &lt;= 1
+     * @param label the matched label. MUST NOT be <code>null</code>
+     * @param labelTokenCount the number of tokens of the label
+     * @throws IllegalArgumentException if one of the documented constraints
+     * is violated
+     * @throws NullPointerException if the label is <code>null</code>
+     */
+    protected LabelMatch(int start, int span,int processableMatchCount, int matchCount, float tokenMatchScore,Text label,int labelTokenCount){
+        //(1) checks that do not depend on the state of this instance
+        if(start < 0){
+            throw new IllegalArgumentException("parsed start position MUST BE >= 0!");
+        }
+        if(span <= 0){
+            throw new IllegalArgumentException("parsed span MUST be > 0!");
+        }
+        if(label == null){
+            throw new NullPointerException("parsed Label MUST NOT be NULL!");
+        }
+        if(tokenMatchScore > 1f){
+            throw new IllegalArgumentException("The matchScore MUST NOT be greater than one (parsed value = "+tokenMatchScore+")");
+        }
+        //(2) init the fields
+        this.start = start;
+        this.span = span;
+        this.label = label;
+        this.tokenMatchScore = tokenMatchScore;
+        this.processableMatchCount = processableMatchCount;
+        this.labelTokenCount = labelTokenCount;
+        //the type only depends on the number of matched processable tokens
+        this.match = processableMatchCount <= 0 ? MATCH.NONE :
+            processableMatchCount == span ? MATCH.FULL : MATCH.PARTIAL;
+        //(3) init the scores
+        double weightedMatches = matchCount*tokenMatchScore;
+        this.textScore = weightedMatches/span;
+        this.labelScore = weightedMatches/labelTokenCount;
+        this.score = this.textScore*this.labelScore;
+        //(4) consistency checks of the counts. Those need to be done last,
+        //because the messages include toString(), which reads the fields
+        //and scores initialised above
+        if(processableMatchCount > span){
+            throw new IllegalArgumentException("The span '" + span
+                + "' MUST BE >= the number of matched processable tokens'"
+                + processableMatchCount+"': "+toString()+"!");
+        }
+        if(matchCount > span){
+            throw new IllegalArgumentException("The span '" + span
+                + "' MUST BE >= the number of matched tokens '"+matchCount+"': "+toString()+"!");
+        }
+        if(matchCount < processableMatchCount){
+            throw new IllegalArgumentException("The number of matched processable tokens '"
+                + processableMatchCount+"' MUST BE <= the number of matched tokens '"
+                + matchCount+"': "+toString()+"!");
+        }
+    }
+
+
+    /**
+     * How well the label matches the text span: the number of matched
+     * tokens (weighted by the token match score) in relation to the number
+     * of tokens of the span. This value gets low if matches are not exact
+     * AND if some words are not matched at all.
+     * @return the text score
+     */
+    public double getTextScore() {
+        return textScore;
+    }
+    /**
+     * How well the text matches the label: the number of matched tokens
+     * (weighted by the token match score) in relation to the number of
+     * tokens of the label.<p>
+     * This score gets low if the label defines a lot of additional
+     * tokens that are not present in the text.
+     * @return the label score
+     */
+    public double getLabelScore() {
+        return labelScore;
+    }
+    /**
+     * The label this match was created for.
+     * @return the label
+     */
+    public Text getMatchedLabel() {
+        return label;
+    }
+    /**
+     * Getter for the number of tokens of the label. Used to calculate
+     * the {@link #getLabelScore() label score}.
+     * @return the labelTokenCount
+     */
+    public int getLabelTokenCount() {
+        return labelTokenCount;
+    }
+    /**
+     * Getter for the type of the match
+     * @return The type of the match
+     */
+    public MATCH getMatch() {
+        return match;
+    }
+    /**
+     * The overall score how well the label matches the text.
+     * This is the product of the {@link #getLabelScore() labelScore}
+     * with the {@link #getTextScore() textScore}
+     * @return the overall score
+     */
+    public double getMatchScore() {
+        return score;
+    }
+    /**
+     * Getter for the number of tokens covered by this match
+     * @return The number of tokens covered by this match
+     */
+    public int getSpan() {
+        return span;
+    }
+    /**
+     * Getter for the start index of this match
+     * @return the start token index for this match
+     */
+    public int getStart() {
+        return start;
+    }
+    /**
+     * Getter for the number of matching processable tokens.
+     * @return The number of matching processable tokens.
+     */
+    public int getMatchCount() {
+        return processableMatchCount;
+    }
+
+    @Override
+    public String toString() {
+        if(match == MATCH.NONE){
+            return "no match";
+        }
+        StringBuilder sb = new StringBuilder(label.getText());
+        //type, span and counts
+        sb.append("[m=").append(match);
+        sb.append(",s=").append(span);
+        sb.append(",c=").append(processableMatchCount);
+        sb.append('(').append(tokenMatchScore).append(")/");
+        sb.append(labelTokenCount);
+        //scores
+        sb.append("] score=").append(score);
+        sb.append("[l=").append(labelScore);
+        sb.append(",t=").append(textScore);
+        sb.append(']');
+        return sb.toString();
+    }
+
+    /**
+     * Compares {@link LabelMatch} instances first based on the
+     * {@link LabelMatch#getMatchCount() number of matched tokens} (bigger
+     * first). If this number is equals, or any of the two instances has
+     * {@link MATCH#NONE}, the ordinal of the {@link LabelMatch#getMatch()
+     * match type} is compared instead (higher ordinal first).
+     */
+    public static final Comparator<LabelMatch> DEFAULT_LABEL_TOKEN_COMPARATOR = new Comparator<LabelMatch>() {
+        @Override
+        public int compare(LabelMatch arg0, LabelMatch arg1) {
+            boolean anyNone = arg0.match == MATCH.NONE || arg1.match == MATCH.NONE;
+            int countDif = arg1.processableMatchCount - arg0.processableMatchCount;
+            if(!anyNone && countDif != 0){
+                return countDif; //bigger should be first
+            }
+            return arg1.match.ordinal() - arg0.match.ordinal(); //higher ordinal first
+        }
+    };
+
+}

Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LinkedEntity.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LinkedEntity.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LinkedEntity.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LinkedEntity.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,234 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+
+/**
+ * The occurrence of an detected Entity within the content. <p>
+ * Note that this class already stores the information in a structure as needed
+ * to write Enhancements as defined by the upcoming 2nd version of the
+ * Apache Stanbol Enhancement Structure (EntityAnnotation, TextOccurrence and
+ * EntitySuggestion). However it can also be used to write
+ * TextAnnotations and EntityAnnotations as defined by the 1st version
+ * @author Rupert Westenthaler
+ *
+ */
+public class LinkedEntity {
+    /**
+     * An mention of an linked entity within the text
+     * @author Rupert Westenthaler
+     *
+     */
+    public class Occurrence {
+        /**
+         * The maximum number of chars until that the current sentence is used
+         * as context for TextOcccurrences. If the sentence is longer a area of
+         * {@link #CONTEXT_TOKEN_COUNT} before and after the current selected
+         * text is used as context.<p>
+         * This is especially important in case no sentence detector is available
+         * for the current content. Because in this case the whole text is
+         * parsed as a single Sentence.
+         * TODO: Maybe find a more clever way to determine the context
+         */
+        public static final int MAX_CONTEXT_LENGTH = 200;
+        /**
+         * The number of tokens surrounding the current selected text used to
+         * calculate the context if the current sentence is longer than
+         * {@link #MAX_CONTEXT_LENGTH} chars.<p>
+         * This is especially important in case no sentence detector is available
+         * for the current content. Because in this case the whole text is
+         * parsed as a single Sentence.
+         * TODO: Maybe find a more clever way to determine the context
+         */
+        public static final int CONTEXT_TOKEN_COUNT = 5;
+        private final int start;
+        private final int end;
+        private final String context;
+
+        private Occurrence(Section sentence,Token token) {
+            this(sentence,token,token);
+        }
+        private Occurrence(Section sentence,Token start,Token end){
+            this.start = start.getStart();
+            this.end = end.getEnd();
+            String context = sentence.getSpan();
+            if(context.length() > MAX_CONTEXT_LENGTH){
+                context = start.getContext().getSpan().substring(
+                    Math.max(0, this.start-CONTEXT_TOKEN_COUNT),
+                    Math.min(this.end+CONTEXT_TOKEN_COUNT, start.getContext().getEnd())-1);
+            }
+            this.context = context;
+        }
+        /**
+         * The context (surrounding text) of the occurrence.
+         * @return
+         */
+        public String getContext() {
+            return context;
+        }
+        /**
+         * The start index of the occurrence
+         * @return the start index relative to the start of the text 
+         */
+        public int getStart() {
+            return start;
+        }
+        /**
+         * the end index of the occurrence
+         * @return the end index relative to the start of the text
+         */
+        public int getEnd() {
+            return end;
+        }
+        /**
+         * The selected text of this occurrence. Actually returns the value
+         * of {@link LinkedEntity#getSelectedText()}, because th
+         * @return
+         */
+        public String getSelectedText(){
+            return LinkedEntity.this.getSelectedText();
+        }
+        @Override
+        public String toString() {
+            return start+","+end;
+        }
+        @Override
+        public int hashCode() {
+            return context.hashCode()+start+end;
+        }
+        @Override
+        public boolean equals(Object arg0) {
+            return arg0 instanceof Occurrence && 
+                ((Occurrence)arg0).start == start &&
+                ((Occurrence)arg0).end == end &&
+                ((Occurrence)arg0).context.equals(context);
+        }
+    }
+    private final String selectedText;
+    private final Set<UriRef> types;
+    private final List<Suggestion> suggestions;
+    private final Collection<Occurrence> occurrences = new ArrayList<Occurrence>();
+    private final Collection<Occurrence> unmodOccurrences = Collections.unmodifiableCollection(occurrences);
+    /**
+     * Creates a new LinkedEntity for the parsed parameters
+     * @param selectedText the selected text
+     * @param suggestions the entity suggestions
+     * @param types the types of the linked entity. 
+     */
+    protected LinkedEntity(String selectedText, List<Suggestion> suggestions, Set<UriRef> types) {
+        this.suggestions = Collections.unmodifiableList(suggestions);
+        this.selectedText = selectedText;
+        this.types = Collections.unmodifiableSet(types);
+    }
+   /**
+     * Creates a new Linked Entity including the first {@link Occurrence}
+     * @param section the sentence (context) for the occurrence.
+     * @param startToken the index of the start token
+     * @param tokenSpan the number of token included in this span
+     * @param suggestions the entity suggestions
+     * @param types the types of the linked entity. 
+     */
+    protected LinkedEntity(Section section,Token startToken,Token endToken, 
+                           List<Suggestion> suggestions, Set<UriRef> types) {
+        this(startToken.getSpan().substring(startToken.getStart(), endToken.getEnd()),
+            suggestions,types);
+        addOccurrence(section, startToken,endToken);
+    }
+    /**
+     * Getter for the selected text
+     * @return the selected text
+     */
+    public String getSelectedText() {
+        return selectedText;
+    }
+    
+    /**
+     * Getter for read only list of types
+     * @return the types
+     */
+    public Set<UriRef> getTypes() {
+        return types;
+    }
+    /**
+     * Adds an new Occurrence
+     * @param sentence the analysed sentence
+     * @param startToken the start token
+     * @param tokenSpan the number of tokens included in this span
+     * @return the new Occurrence also added to {@link #getOccurrences()}
+     */
+    protected Occurrence addOccurrence(Section section,Token startToken,Token tokenSpan){
+        Occurrence o = new Occurrence(section, startToken, tokenSpan);
+        occurrences.add(o);
+        return o;
+    }
+    /**
+     * Getter for the read only list of Occurrences
+     * @return the occurrences
+     */
+    public Collection<Occurrence> getOccurrences(){
+        return unmodOccurrences;
+    }
+    /**
+     * Getter for the read only list of Suggestions
+     * @return the suggestions
+     */
+    public List<Suggestion> getSuggestions(){
+        return suggestions;
+    }
+    
+    /**
+     * Getter for the Score
+     * @return The score of the first element in {@link #getSuggestions()} or 
+     * <code>0</code> if there are no suggestions
+     */
+    public double getScore(){
+        return suggestions.isEmpty() ? 0f : suggestions.get(0).getScore();
+    }
+    
+    /**
+     * Only considers the {@link #getSelectedText()}, because it is assumed that
+     * for the same selected text there MUST BE always the same suggestions with
+     * the same types and occurrences.
+     */
+    @Override
+    public int hashCode() {
+        return selectedText.hashCode();
+    }
+    /**
+     * Only considers the {@link #getSelectedText()}, because it is assumed that
+     * for the same selected text there MUST BE always the same suggestions with
+     * the same types and occurrences.
+     */
+    @Override
+    public boolean equals(Object arg0) {
+        return arg0 instanceof LinkedEntity && 
+        ((LinkedEntity)arg0).selectedText.equals(selectedText);
+    }
+    @Override
+    public String toString() {
+        return selectedText+'@'+occurrences+"->"+suggestions;
+    }
+}

Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/MainLabelTokenizer.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/MainLabelTokenizer.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/MainLabelTokenizer.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/MainLabelTokenizer.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,211 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.Constants;
+import org.osgi.framework.ServiceReference;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.osgi.util.tracker.ServiceTracker;
+import org.osgi.util.tracker.ServiceTrackerCustomizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Component(immediate=true)
+@Service
+@Properties(
+    value={ //Ensure that his LabelTokenizer is highest priority
+           @Property(name=Constants.SERVICE_RANKING,intValue=Integer.MAX_VALUE)
+})
+public class MainLabelTokenizer implements LabelTokenizer {
+
+    private static final String[] DEFAULT_LANG_CONF = new String[]{"*"};
+
+    private final Logger log = LoggerFactory.getLogger(MainLabelTokenizer.class);
+    
+    private ServiceTracker labelTokenizerTracker;
+    
+    private static final Comparator<ServiceReference> RANKING_COMPARATOR = new Comparator<ServiceReference>() {
+        
+        public int compare(ServiceReference ref1, ServiceReference ref2) {
+            int r1,r2;
+            Object tmp = ref1.getProperty(Constants.SERVICE_RANKING);
+            r1 = tmp != null ? ((Integer)tmp).intValue() : 0;
+            tmp = (Integer)ref2.getProperty(Constants.SERVICE_RANKING);
+            r2 = tmp != null ? ((Integer)tmp).intValue() : 0;
+            if(r1 == r2){
+                tmp = (Long)ref1.getProperty(Constants.SERVICE_ID);
+                long id1 = tmp != null ? ((Long)tmp).longValue() : Long.MAX_VALUE;
+                tmp = (Long)ref2.getProperty(Constants.SERVICE_ID);
+                long id2 = tmp != null ? ((Long)tmp).longValue() : Long.MAX_VALUE;
+                //the lowest id must be first -> id1 < id2 -> [id1,id2] -> return -1
+                return id1 < id2 ? -1 : id2 == id1 ? 0 : 1; 
+            } else {
+                //the highest ranking MUST BE first -> r1 < r2 -> [r2,r1] -> return 1
+                return r1 < r2 ? 1:-1;
+            }
+        }        
+    };
+    
+    private Map<ServiceReference,LanguageConfiguration> ref2LangConfig = 
+            Collections.synchronizedMap(new HashMap<ServiceReference,LanguageConfiguration>());
+    
+    /**
+     * Lazily initialized keys based on requested languages.
+     * Cleared every time when {@link #ref2LangConfig} changes.
+     */
+    private Map<String,List<ServiceReference>> langTokenizers = 
+            Collections.synchronizedMap(new HashMap<String,List<ServiceReference>>());
+    
+    
+    @Activate
+    protected void activate(ComponentContext ctx){
+        final BundleContext bundleContext = ctx.getBundleContext();
+        final String managerServicePid = (String)ctx.getProperties().get(Constants.SERVICE_PID);
+        labelTokenizerTracker = new ServiceTracker(bundleContext, 
+            LabelTokenizer.class.getName(), 
+            new ServiceTrackerCustomizer() {
+                
+                @Override
+                public Object addingService(ServiceReference reference) {
+                    if(managerServicePid.equals(reference.getProperty(Constants.SERVICE_PID))){
+                        return null; //do not track this manager!
+                    }
+                    LanguageConfiguration langConf = new LanguageConfiguration(SUPPORTED_LANUAGES, DEFAULT_LANG_CONF);
+                    try {
+                        langConf.setConfiguration(reference);
+                    } catch (ConfigurationException e) {
+                        log.error("Unable to track ServiceReference {} becuase of invalid LanguageConfiguration("
+                            + SUPPORTED_LANUAGES+"="+reference.getProperty(SUPPORTED_LANUAGES)+")!",e);
+                        return null;
+                    }
+                    Object service = bundleContext.getService(reference);
+                    if(service != null){
+                        ref2LangConfig.put(reference, langConf);
+                        langTokenizers.clear();
+                    }
+                    return service;
+                }
+
+
+                @Override
+                public void modifiedService(ServiceReference reference, Object service) {
+                    if(managerServicePid.equals(reference.getProperty(Constants.SERVICE_PID))){
+                        return; //ignore this service!
+                    }
+                    LanguageConfiguration langConf = new LanguageConfiguration(SUPPORTED_LANUAGES, DEFAULT_LANG_CONF);
+                    try {
+                        langConf.setConfiguration(reference);
+                        ref2LangConfig.put(reference, langConf);
+                        langTokenizers.clear();
+                    } catch (ConfigurationException e) {
+                        log.error("Unable to track ServiceReference {} becuase of invalid LanguageConfiguration("
+                            + SUPPORTED_LANUAGES+"="+reference.getProperty(SUPPORTED_LANUAGES)+")!",e);
+                        if(ref2LangConfig.remove(reference) != null){
+                            langTokenizers.clear();
+                        }
+                    }
+                }
+
+
+                @Override
+                public void removedService(ServiceReference reference, Object service) {
+                    if(managerServicePid.equals(reference.getProperty(Constants.SERVICE_PID))){
+                        return; //ignore this service
+                    }
+                    bundleContext.ungetService(reference);
+                    if(ref2LangConfig.remove(reference) != null){
+                        langTokenizers.clear();
+                    }
+                }
+            });
+        labelTokenizerTracker.open();
+    }
+    
+    
+    @Deactivate
+    protected void deactivate(ComponentContext ctx){
+        if(labelTokenizerTracker != null){
+            labelTokenizerTracker.close();
+            labelTokenizerTracker = null;
+        }
+    }
+    /**
+     * Getter for the Servcice based on a Service Refernece
+     * @param ref
+     * @return
+     */
+    public LabelTokenizer getService(ServiceReference ref){
+        return (LabelTokenizer) labelTokenizerTracker.getService();
+    }
+    /**
+     * Getter for the list of {@link ServiceReference}s for all
+     * tracked {@link LabelTokenizer} supporting the parsed language.
+     * Entries in the List are sorted by "service.ranking"
+     * @param language
+     * @return
+     */
+    public List<ServiceReference> getTokenizers(String language){
+        List<ServiceReference> langTokenizers = this.langTokenizers.get(language);
+        if(langTokenizers == null ){
+            langTokenizers = initTokenizers(language);
+        }
+        return langTokenizers;
+    }
+
+    
+    private List<ServiceReference> initTokenizers(String language) {
+        List<ServiceReference> tokenizers = new ArrayList<ServiceReference>();
+        if(labelTokenizerTracker.getServiceReferences() != null){
+            for(ServiceReference ref : labelTokenizerTracker.getServiceReferences()){
+                LanguageConfiguration langConf = ref2LangConfig.get(ref);
+                if(langConf != null && langConf.isLanguage(language)){
+                    tokenizers.add(ref);
+                }
+            }
+        }
+        if(tokenizers.size() > 1){
+            Collections.sort(tokenizers,RANKING_COMPARATOR);
+        }
+        this.langTokenizers.put(language, tokenizers);
+        return tokenizers;
+    }
+    
+    /* (non-Javadoc)
+     * @see org.apache.stanbol.enhancer.engines.keywordextraction.impl.LabelTokenizerManager#tokenize(java.lang.String, java.lang.String)
+     */
+    @Override
+    public String[] tokenize(String label,String language){
+        for(ServiceReference ref : getTokenizers(language)){
+            LabelTokenizer tokenizer = (LabelTokenizer)labelTokenizerTracker.getService(ref);
+            if(tokenizer != null){
+                log.trace(" > use Tokenizer {} for language {}",tokenizer.getClass(),language);
+                String[] tokens = tokenizer.tokenize(label, language);
+                if(tokens != null){
+                    if(log.isTraceEnabled()){
+                        log.trace("   - tokenized {} -> {}",label, Arrays.toString(tokens));
+                    }
+                    return tokens;
+                }
+            }
+        }
+        log.warn("No LabelTokenizer availabel for language {} -> return null",language);
+        return null;
+    }
+    
+}