You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/24 10:40:11 UTC
svn commit: r1413155 [3/4] - in
/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking: ./
src/ src/license/ src/main/ src/main/java/ src/main/java/org/
src/main/java/org/apache/ src/main/java/org/apache/stanbol/
src/main/java/org/apac...
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,693 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.RedirectProcessingMode;
+import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.ProcessingState.TokenData;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class EntityLinker {
+
+ private final Logger log = LoggerFactory.getLogger(EntityLinker.class); //used for the debug/warn output of the linking process
+
+ private final EntityLinkerConfig linkerConfig; //linking options (search limits, min scores, type mappings, redirect mode)
+ private final LanguageProcessingConfig textProcessingConfig; //consulted by process() for isIgnoreChunks()
+ //private final AnalysedText analysedText;
+ private final EntitySearcher entitySearcher; //used to lookup entities and to retrieve redirected entities
+ /**
+ * The state of the current processing. Provides the current token, the
+ * list of tokens, the consumed index and the language used while linking.
+ */
+ private final ProcessingState state;
+ /**
+ * The map holding the results of the linking process. The text selected
+ * in the document is used as key.
+ */
+ private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
+
+ private Integer lookupLimit; //limit parsed to EntitySearcher#lookup(..); set by the constructor to max(10, 2*maxSuggestions)
+
+ private LabelTokenizer labelTokenizer; //used by matchLabel(..) to tokenize the labels of entities
+
+
+ public EntityLinker(AnalysedText analysedText, String language,
+ LanguageProcessingConfig textProcessingConfig,
+ EntitySearcher entitySearcher,
+ EntityLinkerConfig linkerConfig,
+ LabelTokenizer labelTokenizer) {
+ //this.analysedText = analysedText;
+ this.entitySearcher = entitySearcher;
+ this.linkerConfig = linkerConfig;
+ this.textProcessingConfig = textProcessingConfig;
+ this.labelTokenizer = labelTokenizer;
+ this.state = new ProcessingState(analysedText,language,textProcessingConfig,linkerConfig);
+ this.lookupLimit = Math.max(10,linkerConfig.getMaxSuggestions()*2);
+ }
+ /**
+ * Steps over the sentences, chunks, tokens of the {@link #sentences}
+ */
+ public void process() throws EngineException {
+ //int debugedIndex = 0;
+ while(state.next()) {
+ TokenData token = state.getToken();
+ if(log.isDebugEnabled()){
+ log.debug("--- preocess Token {}: {} (lemma: {} | pos:{}) chunk: {}",
+ new Object[]{token.index,token.token.getSpan(),
+ token.morpho != null ? token.morpho.getLemma() : "none",
+ token.token.getAnnotations(POS_ANNOTATION),
+ token.inChunk != null ?
+ (token.inChunk.chunk + " "+ token.inChunk.chunk.getSpan()) :
+ "none"});
+ }
+ List<String> searchStrings = new ArrayList<String>(linkerConfig.getMaxSearchTokens());
+ searchStrings.add(token.getTokenText());
+ //Determine the range we are allowed to search for tokens
+ final int minIncludeIndex;
+ int maxIndcludeIndex;
+ if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks()){
+ minIncludeIndex = Math.max(
+ state.getConsumedIndex()+1,
+ token.inChunk.startToken);
+ maxIndcludeIndex = token.inChunk.endToken;
+ } else {
+ maxIndcludeIndex = state.getTokens().size() - 1;
+ minIncludeIndex = state.getConsumedIndex() + 1;
+ }
+ int prevIndex,pastIndex; //search away from the currently active token
+ int distance = 0;
+ do {
+ distance++;
+ prevIndex = token.index-distance;
+ pastIndex = token.index+distance;
+ if(minIncludeIndex <= prevIndex){
+ TokenData prevToken = state.getTokens().get(prevIndex);
+ if(log.isDebugEnabled()){
+ log.debug(" {} {}:'{}' (lemma: {} | pos:{})",new Object[]{
+ prevToken.isMatchable? '+':'-',prevToken.index,
+ prevToken.token.getSpan(),
+ prevToken.morpho != null ? prevToken.morpho.getLemma() : "none",
+ prevToken.token.getAnnotations(POS_ANNOTATION)
+ });
+ }
+ if(prevToken.isMatchable){
+ searchStrings.add(0,prevToken.getTokenText());
+ }
+ }
+ if(maxIndcludeIndex >= pastIndex){
+ TokenData pastToken = state.getTokens().get(pastIndex);
+ if(log.isDebugEnabled()){
+ log.debug(" {} {}:'{}' (lemma: {} | pos:{})",new Object[]{
+ pastToken.isMatchable? '+':'-',pastToken.index,
+ pastToken.token.getSpan(),
+ pastToken.morpho != null ? pastToken.morpho.getLemma() : "none",
+ pastToken.token.getAnnotations(POS_ANNOTATION)
+ });
+ }
+ if(pastToken.isMatchable){
+ searchStrings.add(pastToken.getTokenText());
+ }
+ }
+ } while(searchStrings.size() < linkerConfig.getMaxSearchTokens() && distance <
+ linkerConfig.getMaxSearchDistance() &&
+ (prevIndex > minIncludeIndex || pastIndex < maxIndcludeIndex));
+ //we might have an additional element in the list
+ if(searchStrings.size() > linkerConfig.getMaxSearchTokens()){
+ searchStrings = searchStrings.subList(0, linkerConfig.getMaxSearchTokens());
+ }
+ log.debug(" >> searchStrings {}",searchStrings);
+ //search for Entities
+ List<Suggestion> suggestions = lookupEntities(searchStrings);
+ if(!suggestions.isEmpty()){
+ //update the suggestions based on the best match
+ int bestMatchCount = suggestions.get(0).getLabelMatch().getMatchCount();
+ Iterator<Suggestion> it = suggestions.iterator();
+ while(it.hasNext()){
+ Suggestion suggestion = it.next();
+ //suggestions that match less tokens as the best match
+ //need to be updated to PARTIAL
+ int matchCount = suggestion.getLabelMatch().getMatchCount();
+ if(matchCount < bestMatchCount){
+ suggestion.setMatch(MATCH.PARTIAL);
+ }
+ //Filter matches with less than config.getMinFoundTokens()
+ //if matchcount is less than of the best match
+ if(matchCount < bestMatchCount &&
+ matchCount < linkerConfig.getMinFoundTokens()){
+ it.remove();
+ } else { //calculate the score
+ //how good is the current match in relation to the best one
+ double spanScore = matchCount/bestMatchCount;
+ suggestion.setScore(spanScore*spanScore*suggestion.getLabelMatch().getMatchScore());
+ }
+ }
+ Suggestion oldBestRanked = suggestions.get(0); //for debugging
+ //resort by score
+ Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+ //this should never happen ... but the
+ //matchcount of the best match MUST NOT change
+ //after the sort by score!
+ if(bestMatchCount != suggestions.get(0).getLabelMatch().getMatchCount()){
+ log.warn("The match count for the top Ranked Suggestion for {} " +
+ "changed after resorting based on Scores!",
+ state.getTokenText(suggestions.get(0).getLabelMatch().getStart(),bestMatchCount));
+ log.warn(" originalbest : {}",oldBestRanked);
+ log.warn(" currnet ranking : {}",suggestions);
+ log.warn(" ... this will result in worng confidence values relative to the best match");
+ }
+ //remove all suggestions > config.maxSuggestions
+ if(suggestions.size() > linkerConfig.getMaxSuggestions()){
+ suggestions.subList(linkerConfig.getMaxSuggestions(),suggestions.size()).clear();
+ }
+ if(log.isDebugEnabled()){
+ log.debug(" >> Suggestions:");
+ int i=0;
+ for(Suggestion s : suggestions){
+ log.debug(" - {}: {}",i,s);
+ i++;
+ }
+ }
+ //process redirects
+ if(linkerConfig.getRedirectProcessingMode() != RedirectProcessingMode.IGNORE){
+ for(Suggestion suggestion : suggestions){
+ processRedirects(suggestion);
+ }
+ }
+ int start = suggestions.get(0).getLabelMatch().getStart();
+ int span = suggestions.get(0).getLabelMatch().getSpan();
+ //Store the linking results
+ String selectedText = state.getTokenText(start,span);
+ //float score;
+ LinkedEntity linkedEntity = linkedEntities.get(selectedText);
+ if(linkedEntity == null){
+ linkedEntity = new LinkedEntity(selectedText,
+ suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
+ linkedEntities.put(selectedText, linkedEntity);
+ }
+ linkedEntity.addOccurrence(state.getSentence(),
+ //NOTE: The end Token is "start+span-1"
+ state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
+ //set the next token to process to the next word after the
+ //currently found suggestion
+ state.setConsumed(start+span-1);
+ }
+
+ }
+ }
+    /**
+     * Getter for the entities linked by {@link #process()}. The returned
+     * map is the one used internally by this linker (no copy is created).
+     * The selected text is used as key.
+     * @return the linked entities
+     */
+    public final Map<String,LinkedEntity> getLinkedEntities() {
+        return this.linkedEntities;
+    }
+    /**
+     * Collects the values of the {@link EntityLinkerConfig#getTypeField()
+     * type field} of all parsed {@link Suggestion}s and maps them by using
+     * the configured {@link EntityLinkerConfig#getTypeMappings() type mappings}.
+     * If none of the types is mapped the
+     * {@link EntityLinkerConfig#getDefaultDcType() default type} is used
+     * (if one is configured).
+     * @param suggestions The suggestions to get the types for
+     * @return the types for the {@link LinkedEntity}. Empty if no type is
+     * mapped and no default type is configured.
+     */
+    private Set<UriRef> getLinkedEntityTypes(Collection<Suggestion> suggestions){
+        //(1) collect the types of the suggested entities
+        Set<String> entityTypes = new HashSet<String>();
+        for (Suggestion suggestion : suggestions) {
+            Iterator<Reference> typeRefs =
+                suggestion.getRepresentation().getReferences(linkerConfig.getTypeField());
+            while (typeRefs.hasNext()) {
+                entityTypes.add(typeRefs.next().getReference());
+            }
+        }
+        //(2) apply the type mappings
+        Map<String,UriRef> mappings = linkerConfig.getTypeMappings();
+        Set<UriRef> mappedTypes = new HashSet<UriRef>();
+        for (String entityType : entityTypes) {
+            UriRef mapped = mappings.get(entityType);
+            if (mapped == null) {
+                continue; //no mapping for this type
+            }
+            mappedTypes.add(mapped);
+        }
+        //(3) fall back to the default type
+        if (mappedTypes.isEmpty()) {
+            UriRef defaultType = linkerConfig.getDefaultDcType();
+            if (defaultType != null) {
+                mappedTypes.add(defaultType);
+            }
+        }
+        return mappedTypes;
+    }
+ /**
+ * Processes {@link EntitySearcher#getRedirectField() redirect field} values for
+ * the parsed suggestions based on the {@link RedirectProcessingMode}
+ * as configured in the {@link #config}.<p>
+ * The results of this method are stored within the parsed {@link Suggestion}s
+ * @param suggestion The suggestion to process.
+ */
+ private void processRedirects(Suggestion suggestion) {
+ //if mode is IGNORE -> nothing to do
+ if(linkerConfig.getRedirectProcessingMode() == RedirectProcessingMode.IGNORE){
+ return;
+ }
+ //in case results for queries are locally cached it might be the case
+ //that some/all of the results do already redirects processed.
+ //therefore there is a small internal state that stores this information
+ if(suggestion.isRedirectedProcessed()){
+ return; //Redirects for ResultMatch are already processed ... ignore
+ }
+ Representation result = suggestion.getResult();
+ Iterator<Reference> redirects = result.getReferences(linkerConfig.getRedirectField());
+ switch (linkerConfig.getRedirectProcessingMode()) {
+ case ADD_VALUES:
+ while(redirects.hasNext()){
+ Reference redirect = redirects.next();
+ if(redirect != null){
+ Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
+ linkerConfig.getSelectedFields());
+ if(redirectedEntity != null){
+ for(Iterator<String> fields = redirectedEntity.getFieldNames();fields.hasNext();){
+ String field = fields.next();
+ result.add(field, redirectedEntity.get(field));
+ }
+ }
+ //set that the redirects where searched for this result
+ suggestion.setRedirectProcessed(true);
+ }
+ }
+ case FOLLOW:
+ while(redirects.hasNext()){
+ Reference redirect = redirects.next();
+ if(redirect != null){
+ Representation redirectedEntity = entitySearcher.get(redirect.getReference(),
+ linkerConfig.getSelectedFields());
+ if(redirectedEntity != null){
+ //copy the original result score
+ redirectedEntity.set(RdfResourceEnum.resultScore.getUri(),
+ result.get(RdfResourceEnum.resultScore.getUri()));
+ //set the redirect
+ suggestion.setRedirect(redirectedEntity);
+ }
+ }
+ }
+ default: //nothing to do
+ }
+ }
+ /**
+ * Searches for Entities in the {@link #entitySearcher} corresponding to the
+ * {@link Token#getText() words} of the current {@link #state position} in
+ * the text.
+ * @param searchStrings the list of {@link Token#getText() words} to search
+ * entities for.
+ * @return The sorted list with the suggestions.
+ * If there are no suggestions an empty list will be returned.
+ */
+ private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException {
+ Collection<? extends Representation> results;
+ try {
+ results = entitySearcher.lookup(linkerConfig.getNameField(),
+ linkerConfig.getSelectedFields(),
+ searchStrings,
+ new String[]{state.getLanguage(),linkerConfig.getDefaultLanguage()},
+ lookupLimit);
+ } catch (RuntimeException e) {
+ throw new EngineException(e.getMessage(),e);
+ }
+ log.debug(" - found {} entities ...",results.size());
+ List<Suggestion> suggestions = new ArrayList<Suggestion>();
+ for(Representation result : results){
+ log.debug(" > {}",result.getId());
+ Suggestion suggestion = matchLabels(result);
+ log.debug(" < {}",suggestion);
+ if(suggestion.getMatch() != MATCH.NONE){
+ suggestions.add(suggestion);
+ }
+ }
+ //sort the suggestions
+ if(suggestions.size()>1){
+ Collections.sort(suggestions,Suggestion.MATCH_TYPE_SUGGESTION_COMPARATOR);
+ }
+ //TODO: Work in Progress feature ... allowing to refine search if no
+ // suggestion is found but results where present
+ // However this would need full limit/offset support for the
+ // EntitySearcher. (rwesten 2012-05-21)
+// Integer maxResults = entitySearcher.getLimit();
+// if(maxResults == null){
+// maxResults = 1; //fall back to 1 if limit is not known
+// }
+// if(suggestions.isEmpty() && //if no suggestions where found
+// results.size() >= maxResults && //but the query had max results
+// //than the actual entity might not be within the first LIMIT results
+// searchStrings.size() > 1){ //if multiple words where used for the search
+// //try again with only a single word
+// suggestions = lookupEntities(Collections.singletonList(searchStrings.get(0)));
+//
+// }
+ //remove all elements > config.getMaxSuggestions()
+ return suggestions;
+ }
+ /**
+ * Matches the labels of the parsed {@link Representation} with the Tokens of
+ * the texts (beginning with the currently active
+ * {@link ProcessingState#getToken() token}).<p>
+ * The field used to get the labels is retrieved from
+ * {@link EntitySearcher#getNameField()}. Only labels with no language or the
+ * language of the current sentence are considered. If less than
+ * {@link EntityLinkerConfig#getMinFoundTokens()} tokens match with an
+ * label the Concept is only considered to match if the label is
+ * {@link String#equalsIgnoreCase(String)} to the text covered by the
+ * matched token(s). Otherwise also {@link MATCH#FULL} and {@link MATCH#PARTIAL}
+ * results are allowed.
+ * @param rep The representation including at least the data for the
+ * {@link EntitySearcher#getNameField()} property.
+ * @return The result of the matching.
+ */
+ private Suggestion matchLabels(Representation rep) {
+ String curLang = state.getLanguage(); //language of the current sentence
+ String defLang = linkerConfig.getDefaultLanguage(); //configured default language
+// Iterator<Text> labels = rep.get(config.getNameField(), //get all labels
+// state.getLanguage(), //in the current language
+// config.getDefaultLanguage()); //and the default language
+ Iterator<Text> labels = rep.getText(linkerConfig.getNameField());
+ Suggestion match = new Suggestion(rep);
+ Collection<Text> defaultLabels = new ArrayList<Text>();
+ boolean matchedCurLangLabel = false;
+ while(labels.hasNext()){
+ Text label = labels.next();
+ String lang = label.getLanguage();
+ if((lang == null && curLang == null) ||
+ (lang != null && curLang != null && lang.startsWith(curLang))){
+ matchLabel(match, label);
+ matchedCurLangLabel = true;
+ } else if((lang ==null && defLang == null) ||
+ (lang != null && defLang != null && lang.startsWith(defLang))){
+ defaultLabels.add(label);
+ }
+ }
+ //use only labels in the default language if there is
+ // * no label in the current language or
+ // * no MATCH was found in the current language
+ if(!matchedCurLangLabel || match.getMatch() == MATCH.NONE){
+ for(Text defaultLangLabel : defaultLabels){
+ matchLabel(match, defaultLangLabel);
+ }
+ }
+ return match;
+ }
+
+ /**
+ * Matches a single label of an entity against the tokens of the text and
+ * adds the resulting {@link LabelMatch} to the parsed suggestion if the
+ * match is good enough.<p>
+ * Processing steps:<ol>
+ * <li> The text of the label is converted to lower case (unless case
+ * sensitive matching is configured) and tokenized by the
+ * {@link #labelTokenizer}. If the tokenizer returns <code>null</code> the
+ * whole label is used as single token. Label tokens without alpha numeric
+ * chars are removed.
+ * <li> Starting at the current {@link ProcessingState#getToken() token} the
+ * tokens of the text are compared in forward direction with the tokens of
+ * the label. A token matches if {@link #compareTokens(String, String)}
+ * covers at least {@link EntityLinkerConfig#getMinTokenMatchFactor()} of
+ * the longer of the two tokens. Tokens that are equal to a not yet matched
+ * label token at an other position are accepted with a match factor of
+ * <code>0.7</code>. The search stops at the first matchable token that is
+ * not found or if more than {@link EntityLinkerConfig#getMaxNotFound()}
+ * tokens are not found.
+ * <li> Tokens before the current one (down to, but excluding the consumed
+ * index) are compared in backward direction with the label tokens before
+ * the first one matched by the forward search.
+ * <li> If at least one matchable token was found a {@link LabelMatch} is
+ * created. It is only added to the suggestion if its label-, text- and
+ * match score are not lower than the configured minimum values.
+ * </ol>
+ * @param suggestion the suggestion the {@link LabelMatch} is added to
+ * @param label the label to match with the text
+ */
+ private void matchLabel(Suggestion suggestion, Text label) {
+ String text = label.getText();
+ if(!linkerConfig.isCaseSensitiveMatching()){
+ text = text.toLowerCase(); //TODO use language of label for Locale
+ }
+ //Tokenize the label and remove tokens without alpha numerical chars
+ String[] unprocessedLabelTokens = labelTokenizer.tokenize(text,
+ state.getLanguage()); //TODO: maybe check of Pos.Foreign
+ if(unprocessedLabelTokens == null){ //no tokenizer available
+ log.info("Unable to tokenize {} language texts. Will process untokenized label {}",
+ state.getLanguage(),text);
+ unprocessedLabelTokens = new String[]{text}; //use the whole label as single token
+ }
+ int offset = 0; //the number of label tokens removed so far
+ for(int i=0;i<unprocessedLabelTokens.length;i++){
+ boolean hasAlphaNumericChar = Utils.hasAlphaNumericChar(unprocessedLabelTokens[i]);
+ if(!hasAlphaNumericChar){
+ offset++;
+ } else if(offset > 0){ //move the token to close the gap of removed ones
+ unprocessedLabelTokens[i-offset] = unprocessedLabelTokens[i];
+ }
+ }
+ String[] labelTokens;
+ if(offset == 0){
+ labelTokens = unprocessedLabelTokens;
+ } else { //cut off the (now unused) elements at the end of the array
+ labelTokens = new String[unprocessedLabelTokens.length-offset];
+ System.arraycopy(unprocessedLabelTokens, 0, labelTokens, 0, labelTokens.length);
+ }
+ Set<String> labelTokenSet = new HashSet<String>(
+ Arrays.asList(labelTokens)); //label tokens that are not yet matched
+ int foundProcessableTokens = 0;
+ int foundTokens = 0;
+ float foundTokenMatch = 0; //sum of the match factors of all found tokens
+ //ensure the correct order of the tokens in the suggested entity
+ boolean search = true;
+ int firstFoundIndex = -1;
+ int firstProcessableFoundIndex = -1;
+ int lastFoundIndex = -1;
+ int lastProcessableFoundIndex = -1;
+ int firstFoundLabelIndex = -1;
+ int lastfoundLabelIndex = -1;
+ TokenData currentToken;
+ String currentTokenText;
+ int currentTokenLength;
+ int notFound = 0;
+ int matchedTokensNotWithinProcessableTokenSpan = 0;
+ int foundTokensWithinCoveredProcessableTokens = 0;
+ float minTokenMatchFactor = linkerConfig.getMinTokenMatchFactor();
+ //search for matches within the correct order (forward search)
+ for(int currentIndex = state.getToken().index;
+ currentIndex < state.getTokens().size()
+ && search ;currentIndex++){
+ currentToken = state.getTokens().get(currentIndex);
+ if(currentToken.hasAlphaNumeric){
+ currentTokenText = currentToken.getTokenText();
+ if(!linkerConfig.isCaseSensitiveMatching()){
+ currentTokenText = currentTokenText.toLowerCase();
+ }
+ currentTokenLength = currentTokenText.length();
+ boolean found = false;
+ float matchFactor = 0f;
+ //iteration starts at the next token after the last matched one
+ //so it is OK to skip tokens in the label, but not within the text
+ for(int i = lastfoundLabelIndex+1;!found && i < labelTokens.length;i ++){
+ String labelTokenText = labelTokens[i];
+ int labelTokenLength = labelTokenText.length();
+ float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
+ float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
+ if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison
+ int matchCount = compareTokens(currentTokenText, labelTokenText);
+ if(matchCount/maxLength >= minTokenMatchFactor){
+ lastfoundLabelIndex = i; //set the last found index to the current position
+ found = true; //set found to true -> stops iteration
+ matchFactor = matchCount/maxLength; //how good is the match
+ //remove matched labels from the set to disable them for
+ //a later random order search
+ labelTokenSet.remove(labelTokenText);
+ }
+ }
+ }
+ if(!found){
+ //search for a match in the wrong order
+ //currently only exact matches (for testing)
+ if(found = labelTokenSet.remove(currentTokenText)){
+ matchFactor = 0.7f;
+ }
+ }
+ //int found = text.indexOf(currentToken.getText().toLowerCase());
+ if(found){ //found
+ if(currentToken.isMatchable){
+ foundProcessableTokens++; //only count processable Tokens
+ if(firstProcessableFoundIndex < 0){
+ firstProcessableFoundIndex = currentIndex;
+ }
+ lastProcessableFoundIndex = currentIndex;
+ foundTokensWithinCoveredProcessableTokens++;
+ if(matchedTokensNotWithinProcessableTokenSpan > 0){ //add the pending count of not matchable tokens found since the last matchable one
+ foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
+ matchedTokensNotWithinProcessableTokenSpan;
+ matchedTokensNotWithinProcessableTokenSpan = 0;
+ }
+ } else {
+ matchedTokensNotWithinProcessableTokenSpan++;
+ }
+ foundTokens++;
+ foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
+ if(firstFoundIndex < 0){
+ firstFoundIndex = currentIndex;
+ firstFoundLabelIndex = lastfoundLabelIndex;
+ }
+ lastFoundIndex = currentIndex;
+ } else { //not found
+ notFound++;
+ if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of tokens
+ //that are not processable are not found
+ search = false;
+ }
+ }
+ } // else token without alpha or numeric characters are not processed
+ }
+ //search backwards for label tokens until firstFoundLabelIndex if there
+ //are unconsumed Tokens in the sentence before state.getTokenIndex
+ int currentIndex = state.getToken().index-1;
+ int labelIndex = firstFoundLabelIndex-1;
+ notFound = 0;
+ matchedTokensNotWithinProcessableTokenSpan = 0;
+ search = true;
+ while(search && labelIndex >= 0 && currentIndex > state.getConsumedIndex()){
+ String labelTokenText = labelTokens[labelIndex];
+ if(labelTokenSet.contains(labelTokenText)){ //still not matched
+ currentToken = state.getTokens().get(currentIndex);
+ currentTokenText = currentToken.getTokenText();
+ if(!linkerConfig.isCaseSensitiveMatching()){
+ currentTokenText = currentTokenText.toLowerCase();
+ }
+ currentTokenLength = currentTokenText.length();
+ boolean found = false;
+ float matchFactor = 0f;
+ int labelTokenLength = labelTokenText.length();
+ float maxLength = currentTokenLength > labelTokenLength ? currentTokenLength : labelTokenLength;
+ float lengthDif = Math.abs(currentTokenLength - labelTokenLength);
+ if((lengthDif/maxLength)<=(1-minTokenMatchFactor)){ //this prevents unnecessary string comparison
+ int matchCount = compareTokens(currentTokenText, labelTokenText);
+ if(matchCount/maxLength >= minTokenMatchFactor){
+ found = true; //the token of the text matches the label token
+ matchFactor = matchCount/maxLength; //how good is the match
+ }
+ }
+ if(found){ //found
+ if(currentToken.isMatchable){
+ foundProcessableTokens++; //only count processable Tokens
+ firstProcessableFoundIndex = currentIndex;
+ foundTokensWithinCoveredProcessableTokens++;
+ if(matchedTokensNotWithinProcessableTokenSpan > 0){
+ foundTokensWithinCoveredProcessableTokens = foundTokensWithinCoveredProcessableTokens +
+ matchedTokensNotWithinProcessableTokenSpan;
+ matchedTokensNotWithinProcessableTokenSpan = 0;
+ }
+ } else {
+ matchedTokensNotWithinProcessableTokenSpan++;
+ }
+ foundTokens++;
+ foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
+ firstFoundIndex = currentIndex;
+ labelIndex--;
+ labelTokenSet.remove(labelTokenText);
+ } else {
+ notFound++;
+ if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
+ //stop as soon as a token that needs to be processed is
+ //not found in the label or the maximum number of tokens
+ //that are not processable are not found
+ search = false;
+ }
+ }
+ currentIndex --;
+ } else { //this token is already matched ...
+ labelIndex--; //try the next one
+ }
+ }
+ if(foundProcessableTokens > 0) { //if any Token has matched
+ //create the LabelMatch based on the results of the forward and
+ //the backward search
+ final LabelMatch labelMatch;
+ int coveredTokens = lastFoundIndex-firstFoundIndex+1;
+ int coveredProcessableTokens = lastProcessableFoundIndex-firstProcessableFoundIndex+1;
+ //matched tokens only within the span of the first/last processable token
+ //Matching rules
+ // - if less than config#minTokenFound() than accept only EXACT
+ // - override PARTIAL matches with FULL/EXACT matches only if
+ // foundTokens of the PARTIAL match is > than of the FULL/EXACT
+ // match (this will be very rare)
+ String currentText = state.getTokenText(firstFoundIndex,coveredTokens);
+ if(linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){
+ labelMatch = new LabelMatch(firstFoundIndex, coveredTokens, label);
+ } else {
+ if(foundTokens == labelTokens.length && foundTokens == coveredTokens){
+ //if all token matched set found to covered: May be lower because only
+ //processable tokens are counted, but FULL also checks
+ //of non-processable!
+ foundTokens = coveredTokens;
+ foundProcessableTokens = coveredProcessableTokens;
+ }
+ labelMatch = new LabelMatch(firstProcessableFoundIndex, coveredProcessableTokens,
+ foundProcessableTokens,foundTokensWithinCoveredProcessableTokens,
+ foundTokenMatch/foundTokens,label,labelTokens.length);
+ }
+ if(labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() &&
+ labelMatch.getTextScore() >= linkerConfig.getMinTextScore() &&
+ labelMatch.getMatchScore() >= linkerConfig.getMinMatchScore()){
+ suggestion.addLabelMatch(labelMatch);
+ }
+ } //else NO tokens found -> nothing to do
+ }
+ /**
+ * Compares to token with each other and returns the longest match. The
+ * tokens are compared from the beginning and from the end.
+ * @param token1 the first token
+ * @param token2 the second token
+ * @return the number of matching chars
+ */
+ private int compareTokens(String token1,String token2){
+ int l1 = token1.length(); //length of the first token
+ int l2 = token2.length(); //length of the second token
+ //in case of same length check for equals first
+ if(l1 == l2 && token1.equals(token2)){
+ return l1;
+ }
+ int ml = l1>l2?l2:l1; //minimum length of a token
+ if(ml == 0){
+ return ml;
+ }
+ int f = 0; //forward match count + 1
+ int b = 0; //backward match count + 1
+ boolean match = true; //still matches
+ while(match && f < ml){
+ match = token1.charAt(f) == token2.charAt(f);
+ f++;
+ }
+ if(!match){
+ f--;
+ }
+ if(f < ml){
+ match = true;
+ while(match && b < ml){
+ b++;
+ match = token1.charAt(l1-b) == token2.charAt(l2-b);
+ }
+ if(!match){
+ b--;
+ }
+ }
+ return f > b ? f : b;
+ }
+
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,197 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import java.util.Comparator;
+
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.MATCH;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
+
+ /**
+  * Describes how a single label matches a span of tokens in the text:
+  * the position (start token and span), the kind of {@link MATCH} and the
+  * scores calculated from the number of matched tokens.
+  */
+ public class LabelMatch {
+     /**
+      * To be used in case no match is present
+      */
+     public static final LabelMatch NONE = new LabelMatch();
+
+     private MATCH match = MATCH.NONE;
+     private int start = 0;
+     private int span = 0;
+     private int processableMatchCount = 0;
+     private Text label;
+     private int labelTokenCount = 0;
+     private double score;
+     /**
+      * The score of the token matches. E.g. when a match is based on stemming
+      * or some other kind of fuzziness, then matchers might assign a match
+      * score lower than <code>1.0</code>.
+      */
+     private float tokenMatchScore;
+     private double textScore;
+     private double labelScore;
+
+     private LabelMatch(){
+         //only used internally to create the NONE instance
+     }
+
+     /**
+      * Creates a label match where every token of the span is matched with a
+      * token match score of <code>1.0</code> and where the label has exactly
+      * <code>span</code> tokens. The resulting match type is
+      * {@link MATCH#FULL}.
+      * @param start the index of the first matched token
+      * @param span the number of tokens covered by the match
+      * @param label the matched label
+      */
+     protected LabelMatch(int start, int span, Text label){
+         this(start,span,span,span,1f,label,span);
+     }
+
+     /**
+      * Creates a label match and calculates its scores.
+      * @param start the index of the first matched token (MUST BE &gt;= 0)
+      * @param span the number of tokens covered by the match (MUST BE &gt; 0)
+      * @param processableMatchCount the number of matched processable tokens.
+      * Determines the match type: {@link MATCH#NONE} if &lt;= 0,
+      * {@link MATCH#FULL} if equals to the span, otherwise {@link MATCH#PARTIAL}
+      * @param matchCount the number of matched tokens (MUST BE &gt;= the
+      * processableMatchCount and &lt;= the span)
+      * @param tokenMatchScore the score of the token matches (MUST NOT be
+      * greater than one)
+      * @param label the matched label (MUST NOT be NULL)
+      * @param labelTokenCount the number of tokens of the label
+      * @throws IllegalArgumentException if any of the numeric constraints
+      * is violated
+      * @throws NullPointerException if the label is <code>null</code>
+      */
+     protected LabelMatch(int start, int span,int processableMatchCount, int matchCount, float tokenMatchScore,Text label,int labelTokenCount){
+         //(1) checks that do not depend on any state of this instance
+         if(start < 0){
+             throw new IllegalArgumentException("parsed start position MUST BE >= 0!");
+         }
+         if(span <= 0){
+             throw new IllegalArgumentException("parsed span MUST be > 0!");
+         }
+         if(label == null){
+             throw new NullPointerException("parsed Label MUST NOT be NULL!");
+         }
+         if(tokenMatchScore > 1f){
+             throw new IllegalArgumentException("The matchScore MUST NOT be greater than one (parsed value = "+tokenMatchScore+")");
+         }
+         //(2) init the fields
+         this.start = start;
+         this.span = span;
+         this.label = label;
+         this.tokenMatchScore = tokenMatchScore;
+         this.processableMatchCount = processableMatchCount;
+         this.labelTokenCount = labelTokenCount;
+         if(processableMatchCount > 0){
+             this.match = processableMatchCount == span ? MATCH.FULL : MATCH.PARTIAL;
+         } else {
+             this.match = MATCH.NONE;
+         }
+         //(3) init the scores
+         double suggestionMatchScore = matchCount*this.tokenMatchScore;
+         this.textScore = suggestionMatchScore/this.span;
+         this.labelScore = suggestionMatchScore/this.labelTokenCount;
+         this.score = this.textScore*this.labelScore;
+         //(4) consistency checks. They need to be done after the fields are
+         //    initialised, because toString() is part of the messages
+         if(span < processableMatchCount){
+             throw new IllegalArgumentException("The span '" + span
+                 + "' MUST BE >= the number of matched processable tokens'"
+                 + processableMatchCount+"': "+toString()+"!");
+         }
+         if(span < matchCount){
+             throw new IllegalArgumentException("The span '" + span
+                 + "' MUST BE >= the number of matched tokens '"+matchCount+"': "+toString()+"!");
+         }
+         if(processableMatchCount > matchCount){
+             throw new IllegalArgumentException("The number of matched processable tokens '"
+                 + processableMatchCount+"' MUST BE <= the number of matched tokens '"
+                 + matchCount+"': "+toString()+"!");
+         }
+     }
+
+
+     /**
+      * How well the label matches the text span. Calculated as the number
+      * of matched tokens multiplied by the token match score and divided
+      * by the span. This value gets low if matches are not exact AND if
+      * some words are not matched at all.
+      * @return the text score
+      */
+     public double getTextScore() {
+         return textScore;
+     }
+     /**
+      * How well the text matches the label. Sets the matched tokens
+      * (weighted by the token match score) in relation to the number of
+      * tokens of the label.<p>
+      * This score gets low if the label defines a lot of additional
+      * tokens that are not present in the text.
+      * @return the label score
+      */
+     public double getLabelScore() {
+         return labelScore;
+     }
+     /**
+      * The label this match was created for.
+      * @return the label (<code>null</code> for {@link #NONE})
+      */
+     public Text getMatchedLabel() {
+         return label;
+     }
+     /**
+      * Getter for the number of tokens of the label. Used to calculate
+      * the {@link #getLabelScore() label score}
+      * @return the labelTokenCount
+      */
+     public int getLabelTokenCount() {
+         return labelTokenCount;
+     }
+     /**
+      * Getter for the type of the match
+      * @return The type of the match
+      */
+     public MATCH getMatch() {
+         return match;
+     }
+     /**
+      * The overall score how well the label matches the text.
+      * This is the product of the {@link #getLabelScore() labelScore}
+      * with the {@link #getTextScore() textScore}
+      * @return the overall score
+      */
+     public double getMatchScore() {
+         return score;
+     }
+     /**
+      * Getter for the number of tokens covered by this match
+      * @return the span
+      */
+     public int getSpan() {
+         return span;
+     }
+     /**
+      * Getter for the start index of this match
+      * @return the index of the start token
+      */
+     public int getStart() {
+         return start;
+     }
+     /**
+      * Getter for the number of matched processable tokens.
+      * @return The number of matched processable tokens.
+      */
+     public int getMatchCount() {
+         return processableMatchCount;
+     }
+
+     @Override
+     public String toString() {
+         if(match == MATCH.NONE){
+             return "no match";
+         }
+         return new StringBuilder(label.getText())
+             .append("[m=").append(match)
+             .append(",s=").append(span)
+             .append(",c=").append(processableMatchCount)
+             .append('(').append(tokenMatchScore).append(")/").append(labelTokenCount)
+             .append("] score=").append(score)
+             .append("[l=").append(labelScore)
+             .append(",t=").append(textScore).append(']')
+             .toString();
+     }
+
+     /**
+      * Compares {@link LabelMatch}es first based on the
+      * {@link LabelMatch#getMatchCount() number of matched tokens} (bigger
+      * first). If the numbers of matched tokens are equal or any of the two
+      * instances has {@link MATCH#NONE}, the {@link MATCH} types are
+      * compared instead (higher ordinal first).
+      */
+     public static final Comparator<LabelMatch> DEFAULT_LABEL_TOKEN_COMPARATOR = new Comparator<LabelMatch>() {
+         @Override
+         public int compare(LabelMatch arg0, LabelMatch arg1) {
+             boolean byMatchCount = arg0.match != MATCH.NONE
+                 && arg1.match != MATCH.NONE
+                 && arg0.processableMatchCount != arg1.processableMatchCount;
+             if(byMatchCount){
+                 //bigger should be first
+                 return arg1.processableMatchCount - arg0.processableMatchCount;
+             }
+             //higher ordinal first
+             return arg1.match.ordinal() - arg0.match.ordinal();
+         }
+     };
+
+ }
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LinkedEntity.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LinkedEntity.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LinkedEntity.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LinkedEntity.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,234 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+
+/**
+ * The occurrence of an detected Entity within the content. <p>
+ * Note that this class already stores the information in a structure as needed
+ * to write Enhancements as defined by the upcoming 2nd version of the
+ * Apache Stanbol Enhancement Structure (EntityAnnotation, TextOccurrence and
+ * EntitySuggestion). However it can also be used to write
+ * TextAnnotations and EntityAnnotations as defined by the 1st version
+ * @author Rupert Westenthaler
+ *
+ */
+public class LinkedEntity {
+ /**
+ * An mention of an linked entity within the text
+ * @author Rupert Westenthaler
+ *
+ */
+ public class Occurrence {
+ /**
+ * The maximum number of chars until that the current sentence is used
+ * as context for TextOcccurrences. If the sentence is longer a area of
+ * {@link #CONTEXT_TOKEN_COUNT} before and after the current selected
+ * text is used as context.<p>
+ * This is especially important in case no sentence detector is available
+ * for the current content. Because in this case the whole text is
+ * parsed as a single Sentence.
+ * TODO: Maybe find a more clever way to determine the context
+ */
+ public static final int MAX_CONTEXT_LENGTH = 200;
+ /**
+ * The number of tokens surrounding the current selected text used to
+ * calculate the context if the current sentence is longer than
+ * {@link #MAX_CONTEXT_LENGTH} chars.<p>
+ * This is especially important in case no sentence detector is available
+ * for the current content. Because in this case the whole text is
+ * parsed as a single Sentence.
+ * TODO: Maybe find a more clever way to determine the context
+ */
+ public static final int CONTEXT_TOKEN_COUNT = 5;
+ private final int start;
+ private final int end;
+ private final String context;
+
+ private Occurrence(Section sentence,Token token) {
+ this(sentence,token,token);
+ }
+ private Occurrence(Section sentence,Token start,Token end){
+ this.start = start.getStart();
+ this.end = end.getEnd();
+ String context = sentence.getSpan();
+ if(context.length() > MAX_CONTEXT_LENGTH){
+ context = start.getContext().getSpan().substring(
+ Math.max(0, this.start-CONTEXT_TOKEN_COUNT),
+ Math.min(this.end+CONTEXT_TOKEN_COUNT, start.getContext().getEnd())-1);
+ }
+ this.context = context;
+ }
+ /**
+ * The context (surrounding text) of the occurrence.
+ * @return
+ */
+ public String getContext() {
+ return context;
+ }
+ /**
+ * The start index of the occurrence
+ * @return the start index relative to the start of the text
+ */
+ public int getStart() {
+ return start;
+ }
+ /**
+ * the end index of the occurrence
+ * @return the end index relative to the start of the text
+ */
+ public int getEnd() {
+ return end;
+ }
+ /**
+ * The selected text of this occurrence. Actually returns the value
+ * of {@link LinkedEntity#getSelectedText()}, because th
+ * @return
+ */
+ public String getSelectedText(){
+ return LinkedEntity.this.getSelectedText();
+ }
+ @Override
+ public String toString() {
+ return start+","+end;
+ }
+ @Override
+ public int hashCode() {
+ return context.hashCode()+start+end;
+ }
+ @Override
+ public boolean equals(Object arg0) {
+ return arg0 instanceof Occurrence &&
+ ((Occurrence)arg0).start == start &&
+ ((Occurrence)arg0).end == end &&
+ ((Occurrence)arg0).context.equals(context);
+ }
+ }
+ private final String selectedText;
+ private final Set<UriRef> types;
+ private final List<Suggestion> suggestions;
+ private final Collection<Occurrence> occurrences = new ArrayList<Occurrence>();
+ private final Collection<Occurrence> unmodOccurrences = Collections.unmodifiableCollection(occurrences);
+ /**
+ * Creates a new LinkedEntity for the parsed parameters
+ * @param selectedText the selected text
+ * @param suggestions the entity suggestions
+ * @param types the types of the linked entity.
+ */
+ protected LinkedEntity(String selectedText, List<Suggestion> suggestions, Set<UriRef> types) {
+ this.suggestions = Collections.unmodifiableList(suggestions);
+ this.selectedText = selectedText;
+ this.types = Collections.unmodifiableSet(types);
+ }
+ /**
+ * Creates a new Linked Entity including the first {@link Occurrence}
+ * @param section the sentence (context) for the occurrence.
+ * @param startToken the index of the start token
+ * @param tokenSpan the number of token included in this span
+ * @param suggestions the entity suggestions
+ * @param types the types of the linked entity.
+ */
+ protected LinkedEntity(Section section,Token startToken,Token endToken,
+ List<Suggestion> suggestions, Set<UriRef> types) {
+ this(startToken.getSpan().substring(startToken.getStart(), endToken.getEnd()),
+ suggestions,types);
+ addOccurrence(section, startToken,endToken);
+ }
+ /**
+ * Getter for the selected text
+ * @return the selected text
+ */
+ public String getSelectedText() {
+ return selectedText;
+ }
+
+ /**
+ * Getter for read only list of types
+ * @return the types
+ */
+ public Set<UriRef> getTypes() {
+ return types;
+ }
+ /**
+ * Adds an new Occurrence
+ * @param sentence the analysed sentence
+ * @param startToken the start token
+ * @param tokenSpan the number of tokens included in this span
+ * @return the new Occurrence also added to {@link #getOccurrences()}
+ */
+ protected Occurrence addOccurrence(Section section,Token startToken,Token tokenSpan){
+ Occurrence o = new Occurrence(section, startToken, tokenSpan);
+ occurrences.add(o);
+ return o;
+ }
+ /**
+ * Getter for the read only list of Occurrences
+ * @return the occurrences
+ */
+ public Collection<Occurrence> getOccurrences(){
+ return unmodOccurrences;
+ }
+ /**
+ * Getter for the read only list of Suggestions
+ * @return the suggestions
+ */
+ public List<Suggestion> getSuggestions(){
+ return suggestions;
+ }
+
+ /**
+ * Getter for the Score
+ * @return The score of the first element in {@link #getSuggestions()} or
+ * <code>0</code> if there are no suggestions
+ */
+ public double getScore(){
+ return suggestions.isEmpty() ? 0f : suggestions.get(0).getScore();
+ }
+
+ /**
+ * Only considers the {@link #getSelectedText()}, because it is assumed that
+ * for the same selected text there MUST BE always the same suggestions with
+ * the same types and occurrences.
+ */
+ @Override
+ public int hashCode() {
+ return selectedText.hashCode();
+ }
+ /**
+ * Only considers the {@link #getSelectedText()}, because it is assumed that
+ * for the same selected text there MUST BE always the same suggestions with
+ * the same types and occurrences.
+ */
+ @Override
+ public boolean equals(Object arg0) {
+ return arg0 instanceof LinkedEntity &&
+ ((LinkedEntity)arg0).selectedText.equals(selectedText);
+ }
+ @Override
+ public String toString() {
+ return selectedText+'@'+occurrences+"->"+suggestions;
+ }
+}
Added: stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/MainLabelTokenizer.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/MainLabelTokenizer.java?rev=1413155&view=auto
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/MainLabelTokenizer.java (added)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/MainLabelTokenizer.java Sat Nov 24 09:40:08 2012
@@ -0,0 +1,211 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer;
+import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.Constants;
+import org.osgi.framework.ServiceReference;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.osgi.util.tracker.ServiceTracker;
+import org.osgi.util.tracker.ServiceTrackerCustomizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+ /**
+  * {@link LabelTokenizer} implementation that tracks all other registered
+  * {@link LabelTokenizer} services and forwards {@link #tokenize(String, String)}
+  * requests to the tracked tokenizers supporting the requested language.
+  * Tokenizers are asked in the order of their "service.ranking" and the first
+  * result other than <code>null</code> is returned.
+  */
+ @Component(immediate=true)
+ @Service
+ @Properties(
+     value={ //Ensure that this LabelTokenizer is highest priority
+         @Property(name=Constants.SERVICE_RANKING,intValue=Integer.MAX_VALUE)
+ })
+ public class MainLabelTokenizer implements LabelTokenizer {
+
+     private static final String[] DEFAULT_LANG_CONF = new String[]{"*"};
+
+     private final Logger log = LoggerFactory.getLogger(MainLabelTokenizer.class);
+
+     private ServiceTracker labelTokenizerTracker;
+
+     /**
+      * Sorts {@link ServiceReference}s by "service.ranking" (highest first).
+      * References with the same ranking are sorted by "service.id" (lowest
+      * first).
+      */
+     private static final Comparator<ServiceReference> RANKING_COMPARATOR = new Comparator<ServiceReference>() {
+
+         @Override
+         public int compare(ServiceReference ref1, ServiceReference ref2) {
+             int r1 = getRanking(ref1);
+             int r2 = getRanking(ref2);
+             if(r1 != r2){
+                 //the highest ranking MUST BE first -> r1 < r2 -> [r2,r1] -> return 1
+                 return r1 < r2 ? 1:-1;
+             }
+             long id1 = getServiceId(ref1);
+             long id2 = getServiceId(ref2);
+             //the lowest id must be first -> id1 < id2 -> [id1,id2] -> return -1
+             return id1 < id2 ? -1 : id2 == id1 ? 0 : 1;
+         }
+
+         /**
+          * The "service.ranking" or <code>0</code> if the property is
+          * missing or not of type Integer
+          */
+         private int getRanking(ServiceReference ref){
+             Object value = ref.getProperty(Constants.SERVICE_RANKING);
+             return value instanceof Integer ? ((Integer)value).intValue() : 0;
+         }
+
+         /**
+          * The "service.id" or {@link Long#MAX_VALUE} if the property is
+          * missing or not of type Long
+          */
+         private long getServiceId(ServiceReference ref){
+             Object value = ref.getProperty(Constants.SERVICE_ID);
+             return value instanceof Long ? ((Long)value).longValue() : Long.MAX_VALUE;
+         }
+     };
+
+     /**
+      * The language configuration of the tracked {@link LabelTokenizer}s
+      */
+     private Map<ServiceReference,LanguageConfiguration> ref2LangConfig =
+         Collections.synchronizedMap(new HashMap<ServiceReference,LanguageConfiguration>());
+
+     /**
+      * Lazily initialized keys based on requested languages.
+      * Cleared every time when {@link #ref2LangConfig} changes.
+      */
+     private Map<String,List<ServiceReference>> langTokenizers =
+         Collections.synchronizedMap(new HashMap<String,List<ServiceReference>>());
+
+
+     /**
+      * Creates and opens the {@link ServiceTracker} for {@link LabelTokenizer}
+      * services. The service registered for this component (identified by
+      * its "service.pid") is not tracked.
+      * @param ctx the component context
+      */
+     @Activate
+     protected void activate(ComponentContext ctx){
+         final BundleContext bundleContext = ctx.getBundleContext();
+         //NOTE(review): the customizer below expects the "service.pid" of
+         //this component to be present (not null) - confirm
+         final String managerServicePid = (String)ctx.getProperties().get(Constants.SERVICE_PID);
+         labelTokenizerTracker = new ServiceTracker(bundleContext,
+             LabelTokenizer.class.getName(),
+             new ServiceTrackerCustomizer() {
+
+                 @Override
+                 public Object addingService(ServiceReference reference) {
+                     if(managerServicePid.equals(reference.getProperty(Constants.SERVICE_PID))){
+                         return null; //do not track this manager!
+                     }
+                     LanguageConfiguration langConf = new LanguageConfiguration(SUPPORTED_LANUAGES, DEFAULT_LANG_CONF);
+                     try {
+                         langConf.setConfiguration(reference);
+                     } catch (ConfigurationException e) {
+                         //parse the reference for the '{}' placeholder. The
+                         //exception (last argument) is logged as Throwable
+                         log.error("Unable to track ServiceReference {} becuase of invalid LanguageConfiguration("
+                             + SUPPORTED_LANUAGES+"="+reference.getProperty(SUPPORTED_LANUAGES)+")!",reference,e);
+                         return null;
+                     }
+                     Object service = bundleContext.getService(reference);
+                     if(service != null){
+                         ref2LangConfig.put(reference, langConf);
+                         langTokenizers.clear();
+                     }
+                     return service;
+                 }
+
+
+                 @Override
+                 public void modifiedService(ServiceReference reference, Object service) {
+                     if(managerServicePid.equals(reference.getProperty(Constants.SERVICE_PID))){
+                         return; //ignore this service!
+                     }
+                     LanguageConfiguration langConf = new LanguageConfiguration(SUPPORTED_LANUAGES, DEFAULT_LANG_CONF);
+                     try {
+                         langConf.setConfiguration(reference);
+                         ref2LangConfig.put(reference, langConf);
+                         langTokenizers.clear();
+                     } catch (ConfigurationException e) {
+                         log.error("Unable to track ServiceReference {} becuase of invalid LanguageConfiguration("
+                             + SUPPORTED_LANUAGES+"="+reference.getProperty(SUPPORTED_LANUAGES)+")!",reference,e);
+                         //no longer use the service with the invalid config
+                         if(ref2LangConfig.remove(reference) != null){
+                             langTokenizers.clear();
+                         }
+                     }
+                 }
+
+
+                 @Override
+                 public void removedService(ServiceReference reference, Object service) {
+                     if(managerServicePid.equals(reference.getProperty(Constants.SERVICE_PID))){
+                         return; //ignore this service
+                     }
+                     bundleContext.ungetService(reference);
+                     if(ref2LangConfig.remove(reference) != null){
+                         langTokenizers.clear();
+                     }
+                 }
+             });
+         labelTokenizerTracker.open();
+     }
+
+
+     /**
+      * Closes the {@link ServiceTracker} opened by
+      * {@link #activate(ComponentContext)}
+      * @param ctx the component context
+      */
+     @Deactivate
+     protected void deactivate(ComponentContext ctx){
+         if(labelTokenizerTracker != null){
+             labelTokenizerTracker.close();
+             labelTokenizerTracker = null;
+         }
+     }
+     /**
+      * Getter for the Service based on a Service Reference
+      * @param ref the reference of a tracked {@link LabelTokenizer}
+      * @return the service for the parsed reference or <code>null</code> if
+      * the parsed reference is not tracked
+      */
+     public LabelTokenizer getService(ServiceReference ref){
+         //use the parsed reference instead of returning an arbitrary
+         //tracked service
+         return (LabelTokenizer) labelTokenizerTracker.getService(ref);
+     }
+     /**
+      * Getter for the list of {@link ServiceReference}s for all
+      * tracked {@link LabelTokenizer} supporting the parsed language.
+      * Entries in the List are sorted by "service.ranking"
+      * @param language the language
+      * @return the references (an empty list if no tracked tokenizer
+      * supports the parsed language)
+      */
+     public List<ServiceReference> getTokenizers(String language){
+         List<ServiceReference> langTokenizers = this.langTokenizers.get(language);
+         if(langTokenizers == null ){
+             langTokenizers = initTokenizers(language);
+         }
+         return langTokenizers;
+     }
+
+
+     /**
+      * Collects the references of all tracked {@link LabelTokenizer}s with a
+      * language configuration including the parsed language, sorts them by
+      * using the {@link #RANKING_COMPARATOR} and adds the result to
+      * {@link #langTokenizers}.
+      * @param language the language
+      * @return the sorted list of references
+      */
+     private List<ServiceReference> initTokenizers(String language) {
+         List<ServiceReference> tokenizers = new ArrayList<ServiceReference>();
+         //read the tracked references only once, because a 2nd call might
+         //return null if the last service was removed in the meantime
+         ServiceReference[] refs = labelTokenizerTracker.getServiceReferences();
+         if(refs != null){
+             for(ServiceReference ref : refs){
+                 LanguageConfiguration langConf = ref2LangConfig.get(ref);
+                 if(langConf != null && langConf.isLanguage(language)){
+                     tokenizers.add(ref);
+                 }
+             }
+         }
+         if(tokenizers.size() > 1){
+             Collections.sort(tokenizers,RANKING_COMPARATOR);
+         }
+         this.langTokenizers.put(language, tokenizers);
+         return tokenizers;
+     }
+
+     /**
+      * Forwards the request to the tracked {@link LabelTokenizer}s supporting
+      * the parsed language and returns the first result that is not
+      * <code>null</code>.
+      * @param label the label to tokenize
+      * @param language the language of the label
+      * @return the tokens or <code>null</code> if no tracked tokenizer was
+      * able to tokenize the label
+      */
+     @Override
+     public String[] tokenize(String label,String language){
+         for(ServiceReference ref : getTokenizers(language)){
+             LabelTokenizer tokenizer = (LabelTokenizer)labelTokenizerTracker.getService(ref);
+             if(tokenizer != null){
+                 log.trace(" > use Tokenizer {} for language {}",tokenizer.getClass(),language);
+                 String[] tokens = tokenizer.tokenize(label, language);
+                 if(tokens != null){
+                     if(log.isTraceEnabled()){
+                         log.trace(" - tokenized {} -> {}",label, Arrays.toString(tokens));
+                     }
+                     return tokens;
+                 }
+             }
+         }
+         log.warn("No LabelTokenizer availabel for language {} -> return null",language);
+         return null;
+     }
+
+ }