You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/06/25 08:47:13 UTC
svn commit: r1496359 [2/2] - in /stanbol/trunk/enhancement-engines:
entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/
entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/
entityhubli...
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Tue Jun 25 06:47:12 2013
@@ -17,10 +17,8 @@
package org.apache.stanbol.enhancer.engines.entitylinking.impl;
import static org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion.ENTITY_RANK_COMPARATOR;
-import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
@@ -28,13 +26,16 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.NavigableMap;
import java.util.Set;
+import java.util.TreeMap;
-import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException;
@@ -53,6 +54,8 @@ import org.slf4j.LoggerFactory;
public class EntityLinker {
+ private static final int MIN_SEARCH_LIMIT = 10;
+
private final Logger log = LoggerFactory.getLogger(EntityLinker.class);
private final EntityLinkerConfig linkerConfig;
@@ -68,12 +71,30 @@ public class EntityLinker {
*/
private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
- private Integer lookupLimit;
+ //private Integer lookupLimit;
private LabelTokenizer labelTokenizer;
private LinkingStateAware linkingStateAware;
+
+ private int minSearchResults;
+ //Language configuration
+ final String documentLang;
+ final String defaultLang;
+ final String documentMainLang;
+
+ private Statistic textProcessingStats = new Statistic("Text Processing");
+ private Statistic lookupStats = new Statistic("Vocabulary Lookup");
+ private int numQueryResults = 0;
+ private int numFilteredResults = 0;
+ private Statistic matchingStats = new Statistic("Label Matching");
+ private Statistic rankingStats = new Statistic("Suggestion Ranking");
+// private Statistic test = new Statistic("test1");
+// private Statistic test2_ = new Statistic("test2");
+ private int numLabels = 0;
+ private long processingTime = -1;
+
public EntityLinker(AnalysedText analysedText, String language,
LanguageProcessingConfig textProcessingConfig,
@@ -93,15 +114,29 @@ public class EntityLinker {
this.textProcessingConfig = textProcessingConfig;
this.labelTokenizer = labelTokenizer;
this.state = new ProcessingState(analysedText,language,textProcessingConfig);
- this.lookupLimit = Math.max(10,linkerConfig.getMaxSuggestions()*2);
+ minSearchResults = entitySearcher.getLimit() == null ? MIN_SEARCH_LIMIT :
+ Math.max(MIN_SEARCH_LIMIT,entitySearcher.getLimit());
+ //this.lookupLimit = Math.max(minResults,linkerConfig.getMaxSuggestions()*3);
this.linkingStateAware = linkingStateAware;
+ //init the language settings
+ this.documentLang = state.getLanguage();
+ this.defaultLang = linkerConfig.getDefaultLanguage();
+ int countryCodeIndex = documentLang == null ? -1 : documentLang.indexOf('-');
+ if(countryCodeIndex >= 2){
+ documentMainLang = documentLang.substring(0,countryCodeIndex);
+ } else {
+ documentMainLang = null;
+ }
+
}
/**
* Steps over the sentences, chunks, tokens of the {@link #sentences}
*/
public void process() throws EntitySearcherException {
+ long startTime = System.currentTimeMillis();
//int debugedIndex = 0;
Section sentence = null;
+ textProcessingStats.begin();
while(state.next()) {
//STANBOL-1070: added linkingStateAware callbacks for components that
// need to react on the state of the Linking process
@@ -122,56 +157,70 @@ public class EntityLinker {
token.isLinkable, token.isMatchable, token.inChunk != null ?
(token.inChunk.chunk + " "+ token.inChunk.chunk.getSpan()) : "none"});
}
- List<String> searchStrings = new ArrayList<String>(linkerConfig.getMaxSearchTokens());
- String searchString = linkerConfig.isLemmaMatching() ? token.getTokenLemma() :
- token.getTokenText();
- if(searchString == null){
- searchString = token.getTokenText();
- }
- searchStrings.add(searchString);
+ List<TokenData> searchStrings = new ArrayList<TokenData>(linkerConfig.getMaxSearchTokens());
+ getSearchString(token);
+ searchStrings.add(token);
//Determine the range we are allowed to search for tokens
final int minIncludeIndex;
final int maxIndcludeIndex;
//NOTE: testing has shown that using Chunks to restrict search for
// additional matchable tokens does have an negative impact on
// recall. Because of that this restriction is for now deactivated
- boolean restrirctContextByChunks = false; //TODO: maybe make configurable
+ //TODO: maybe make configurable via an own property
+ boolean restrirctContextByChunks = textProcessingConfig.isIgnoreChunks();
+ int consumedIndex = state.getConsumedIndex();
if(token.inChunk != null && !textProcessingConfig.isIgnoreChunks() &&
restrirctContextByChunks){
- minIncludeIndex = Math.max(
- state.getConsumedIndex()+1,
- token.inChunk.getStartTokenIndex());
+ minIncludeIndex = token.inChunk.getStartTokenIndex();
+// minIncludeIndex = Math.max(
+// state.getConsumedIndex()+1,
+// token.inChunk.getStartTokenIndex());
maxIndcludeIndex = token.inChunk.getEndTokenIndex();
} else {
maxIndcludeIndex = state.getTokens().size() - 1;
- minIncludeIndex = state.getConsumedIndex() + 1;
+// minIncludeIndex = state.getConsumedIndex() + 1;
+ minIncludeIndex = 0;
}
- int prevIndex,pastIndex; //search away from the currently active token
+ int prevIndex = token.index;
+ int pastIndex = token.index;
+ int pastNonMatchable = 0;
+ int prevNonMatchable = 0;
int distance = 0;
- do {
- distance++;
- prevIndex = token.index-distance;
- pastIndex = token.index+distance;
- if(minIncludeIndex <= prevIndex){
- TokenData prevToken = state.getTokens().get(prevIndex);
- if(log.isDebugEnabled()){
- log.debug(" {} {}:'{}' (lemma: {}) linkable={}, matchable={}",new Object[]{
- prevToken.isMatchable? '+':'-',prevToken.index,
- prevToken.getTokenText(), prevToken.getTokenLemma(),
- prevToken.isLinkable, prevToken.isMatchable
- });
+ do {
+ distance++;//keep track of the distance
+ //get the past token at the given distance (However ignore
+ //non AlphaNumeric tokens when calculating the distance)
+ pastIndex++;
+ TokenData pastToken = null;
+ while(pastToken == null && maxIndcludeIndex >= pastIndex &&
+ pastNonMatchable <= 1){
+ TokenData td = state.getTokens().get(pastIndex);
+ if(td.hasAlphaNumeric){
+ pastToken = td;
+ } else {
+ pastIndex++;
}
- if(prevToken.isMatchable){
- String prevSearchString = linkerConfig.isLemmaMatching() ?
- prevToken.getTokenLemma() : prevToken.getTokenText();
- if(prevSearchString == null){
- prevSearchString = prevToken.getTokenText();
- }
- searchStrings.add(0,prevSearchString);
+ }
+ //get the previous token at the given distance (However ignore
+ //non AlphaNumeric tokens when calculating the distance)
+ prevIndex--;
+ TokenData prevToken = null;
+ while(prevToken == null && minIncludeIndex <= prevIndex &&
+ //allow one nonMatchable token if prevIndex > the last
+ //consumed one and zero nonMatchable if prevIndex is <=
+ //the last consumed one
+ ((prevIndex > consumedIndex && prevNonMatchable <= 1) ||
+ prevIndex <= consumedIndex && prevNonMatchable < 1)){
+ TokenData td = state.getTokens().get(prevIndex);
+ if(td.hasAlphaNumeric){
+ prevToken = td;
+ } else {
+ prevIndex--;
}
}
- if(maxIndcludeIndex >= pastIndex){
- TokenData pastToken = state.getTokens().get(pastIndex);
+ //now that we know the tokens at this distance check if they are matchable
+ //Fist the past token
+ if(pastToken != null){
if(log.isDebugEnabled()){
log.debug(" {} {}:'{}' (lemma: {}) linkable={}, matchable={}",new Object[]{
pastToken.isMatchable? '+':'-',pastToken.index,
@@ -180,27 +229,52 @@ public class EntityLinker {
});
}
if(pastToken.isMatchable){
- String pastSearchString = linkerConfig.isLemmaMatching() ?
- pastToken.getTokenLemma() : pastToken.getTokenText();
- if(pastSearchString == null){
- pastSearchString = pastToken.getTokenText();
- }
- searchStrings.add(pastSearchString);
+ searchStrings.add(pastToken);
+ } else {
+ pastNonMatchable++;
+ }
+ }
+ //Second in the previous token
+ if(prevToken != null){
+ if(log.isDebugEnabled()){
+ log.debug(" {} {}:'{}' (lemma: {}) linkable={}, matchable={}",new Object[]{
+ prevToken.isMatchable? '+':'-',prevToken.index,
+ prevToken.getTokenText(), prevToken.getTokenLemma(),
+ prevToken.isLinkable, prevToken.isMatchable
+ });
+ }
+ if(prevToken.isMatchable){
+ getSearchString(prevToken);
+ searchStrings.add(0,prevToken);
+ } else {
+ prevNonMatchable++;
}
}
} while(searchStrings.size() < linkerConfig.getMaxSearchTokens() && distance <
linkerConfig.getMaxSearchDistance() &&
- (prevIndex > minIncludeIndex || pastIndex < maxIndcludeIndex));
+ (prevIndex > minIncludeIndex || pastIndex < maxIndcludeIndex) &&
+ (prevNonMatchable <= 1 || pastNonMatchable <= 1));
//we might have an additional element in the list
if(searchStrings.size() > linkerConfig.getMaxSearchTokens()){
searchStrings = searchStrings.subList( //the last part of the list
searchStrings.size()-linkerConfig.getMaxSearchTokens(),
searchStrings.size());
}
- log.debug(" >> searchStrings {}",searchStrings);
+ if(log.isDebugEnabled()){
+ List<String> list = new ArrayList<String>(searchStrings.size());
+ for(TokenData dt : searchStrings){
+ list.add(dt.token.getSpan());
+ }
+ log.debug(" >> searchStrings {}",list);
+ }
+ textProcessingStats.complete();
//search for Entities
List<Suggestion> suggestions = lookupEntities(searchStrings);
+ //Treat partial matches that do match more as the best FULL match
+ //differently
+ List<Suggestion> partialMatches = new ArrayList<Suggestion>();
if(!suggestions.isEmpty()){
+ rankingStats.begin();
//update the suggestions based on the best match
int bestMatchCount = suggestions.get(0).getLabelMatch().getMatchCount();
Iterator<Suggestion> it = suggestions.iterator();
@@ -211,6 +285,9 @@ public class EntityLinker {
int matchCount = suggestion.getLabelMatch().getMatchCount();
if(matchCount < bestMatchCount){
suggestion.setMatch(MATCH.PARTIAL);
+ } else if( matchCount > bestMatchCount){ //selects more tokens
+ partialMatches.add(suggestion); //but only a PARTIAL MATCH
+ it.remove(); //remove from the main suggestion list
}
//Filter matches with less than config.getMinFoundTokens()
//if matchcount is less than of the best match
@@ -219,19 +296,21 @@ public class EntityLinker {
it.remove();
} else { //calculate the score
//how good is the current match in relation to the best one
- double spanScore = matchCount/bestMatchCount;
+ double spanScore = matchCount >= bestMatchCount ? 1.0d :
+ matchCount/(double)bestMatchCount;
suggestion.setScore(spanScore*spanScore*suggestion.getLabelMatch().getMatchScore());
}
}
Suggestion oldBestRanked = suggestions.get(0); //for debugging
//resort by score
Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+ Collections.sort(partialMatches, Suggestion.SCORE_COMPARATOR);
//this should never happen ... but the
//matchcount of the best match MUST NOT change
//after the sort by score!
if(bestMatchCount != suggestions.get(0).getLabelMatch().getMatchCount()){
log.warn("The match count for the top Ranked Suggestion for {} " +
- "changed after resorting based on Scores!",
+ "changed after resorting based on Scores!",
state.getTokenText(suggestions.get(0).getLabelMatch().getStart(),bestMatchCount));
log.warn(" originalbest : {}",oldBestRanked);
log.warn(" currnet ranking : {}",suggestions);
@@ -239,26 +318,8 @@ public class EntityLinker {
}
//adapt equals rankings based on the entity rank
if(linkerConfig.isRankEqualScoresBasedOnEntityRankings()){
- List<Suggestion> equalScoreList = new ArrayList<Suggestion>(4);
- double score = 2f;
- for(Suggestion s : suggestions){
- double actScore = s.getScore();
- if(score == actScore){
- equalScoreList.add(s);
- } else {
- if(equalScoreList.size() > 1){
- adaptScoresForEntityRankings(equalScoreList, actScore);
- }
- score = actScore;
- equalScoreList.clear();
- equalScoreList.add(s);
- }
- }
- if(equalScoreList.size() > 1){
- adaptScoresForEntityRankings(equalScoreList,0);
- }
- //resort by score
- Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+ adaptScoresForEntityRankings(suggestions);
+ adaptScoresForEntityRankings(partialMatches);
}
//remove all suggestions > config.maxSuggestions
if(suggestions.size() > linkerConfig.getMaxSuggestions()){
@@ -277,7 +338,11 @@ public class EntityLinker {
for(Suggestion suggestion : suggestions){
processRedirects(suggestion);
}
+ for(Suggestion suggestion : partialMatches){
+ processRedirects(suggestion);
+ }
}
+ //create LinkedEntities for the main suggestions
int start = suggestions.get(0).getLabelMatch().getStart();
int span = suggestions.get(0).getLabelMatch().getSpan();
//Store the linking results
@@ -288,21 +353,83 @@ public class EntityLinker {
linkedEntity = new LinkedEntity(selectedText,
suggestions, getLinkedEntityTypes(suggestions.subList(0, 1)));
linkedEntities.put(selectedText, linkedEntity);
- }
+ } // else Assumption: The list of suggestions is the SAME
linkedEntity.addOccurrence(state.getSentence(),
//NOTE: The end Token is "start+span-1"
state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
- //set the next token to process to the next word after the
- //currently found suggestion
- state.setConsumed(start+span-1);
+ //In case of a FULL or EXACT MATCH we can set the next token to process to the next
+ //word after the currently found suggestion
+ if(suggestions.get(0).getMatch().ordinal() >= MATCH.FULL.ordinal()){
+ state.setConsumed(start+span-1);
+ }
+ //create LinkedEntities for partial matches
+ //TODO: maybe we need to group partial matches based on their
+ // selected Tokens and only group those suggestions that do
+ // select the same span in the Text. Currently all are grouped
+ // based on those that does select the most tokens.
+ if(!partialMatches.isEmpty()){
+ start = partialMatches.get(0).getLabelMatch().getStart();
+ span = partialMatches.get(0).getLabelMatch().getSpan();
+ selectedText = state.getTokenText(start, span);
+ linkedEntity = linkedEntities.get(selectedText);
+ if(linkedEntity == null){
+ linkedEntity = new LinkedEntity(selectedText,
+ partialMatches, getLinkedEntityTypes(suggestions.subList(0, 1)));
+ linkedEntities.put(selectedText, linkedEntity);
+ } // else Assumption: The list of suggestions is the SAME
+ linkedEntity.addOccurrence(state.getSentence(),
+ //NOTE: The end Token is "start+span-1"
+ state.getTokens().get(start).token, state.getTokens().get(start+span-1).token);
+ }
+ rankingStats.complete();
} // else suggestions are empty
if(linkingStateAware != null){
linkingStateAware.endToken(state.getToken().token);
}
+ textProcessingStats.begin();
}
+ textProcessingStats.cancel(); //do not count the last call
if(linkingStateAware != null && sentence != null){
linkingStateAware.endSection(sentence);
}
+ this.processingTime = System.currentTimeMillis()-startTime;
+ }
+ /**
+ * @param suggestions
+ */
+ private void adaptScoresForEntityRankings(List<Suggestion> suggestions) {
+ List<Suggestion> equalScoreList = new ArrayList<Suggestion>(4);
+ double score = 2f;
+ for(Suggestion s : suggestions){
+ double actScore = s.getScore();
+ if(score == actScore){
+ equalScoreList.add(s);
+ } else {
+ if(equalScoreList.size() > 1){
+ adaptScoreForEntityRankings(equalScoreList, actScore);
+ }
+ score = actScore;
+ equalScoreList.clear();
+ equalScoreList.add(s);
+ }
+ }
+ if(equalScoreList.size() > 1){
+ adaptScoreForEntityRankings(equalScoreList,0);
+ }
+ //resort by score
+ Collections.sort(suggestions, Suggestion.SCORE_COMPARATOR);
+ }
+ /**
+ * Helper that extracts the
+ * @param token
+ */
+ private String getSearchString(TokenData token) {
+ String searchString = linkerConfig.isLemmaMatching() ? token.getTokenLemma() :
+ token.getTokenText();
+ if(searchString == null){
+ searchString = token.getTokenText();
+ }
+ return searchString;
}
/**
* This method slightly adapts scores of Suggestions based on the Entity ranking.
@@ -315,7 +442,7 @@ public class EntityLinker {
* @param nextScore the score of the {@link Suggestion} with a lower score as the
* list of suggestions parsed in the first parameter
*/
- private void adaptScoresForEntityRankings(List<Suggestion> equalScoreList, double nextScore) {
+ private void adaptScoreForEntityRankings(List<Suggestion> equalScoreList, double nextScore) {
double score = equalScoreList.get(0).getScore();
log.debug(" > Adapt Score of multiple Suggestions "
+ "with '{}' based on EntityRanking",score);
@@ -400,7 +527,7 @@ public class EntityLinker {
Iterator<UriRef> redirects = result.getReferences(linkerConfig.getRedirectField());
switch (linkerConfig.getRedirectProcessingMode()) {
case ADD_VALUES:
- MGraph entityData = result.getData();
+ TripleCollection entityData = result.getData();
UriRef entityUri = result.getUri();
while(redirects.hasNext()){
UriRef redirect = redirects.next();
@@ -436,13 +563,13 @@ public class EntityLinker {
* Searches for Entities in the {@link #entitySearcher} corresponding to the
* {@link Token#getText() words} of the current {@link #state position} in
* the text.
- * @param searchStrings the list of {@link Token#getText() words} to search
+ * @param searchTokens the list of {@link Token#getText() words} to search
* entities for.
* @return The sorted list with the suggestions.
* If there are no suggestions an empty list will be returned.
* @throws EntitySearcherException
*/
- private List<Suggestion> lookupEntities(List<String> searchStrings) throws EntitySearcherException {
+ private List<Suggestion> lookupEntities(List<TokenData> searchTokens) throws EntitySearcherException {
Set<String> languages = new HashSet<String>();
languages.add(linkerConfig.getDefaultLanguage());
languages.add(state.getLanguage());
@@ -450,51 +577,153 @@ public class EntityLinker {
if(countryCodeIndex >= 2){
languages.add(state.getLanguage().substring(0,countryCodeIndex));
}
- Collection<? extends Entity> results;
- results = entitySearcher.lookup(linkerConfig.getNameField(),
- linkerConfig.getSelectedFields(),
- searchStrings,
- languages.toArray(new String[languages.size()]),
- lookupLimit);
- log.debug(" - found {} entities ...",results.size());
+ List<String> searchStrings = new ArrayList<String>(searchTokens.size());
+ for(Iterator<TokenData> it = searchTokens.iterator();it.hasNext();){
+ searchStrings.add(getSearchString(it.next()));
+ }
+ String[] languageArray = languages.toArray(new String[languages.size()]);
List<Suggestion> suggestions = new ArrayList<Suggestion>();
+ //perform the lookup with the parsed parameter
+ int numResults = performLookup(searchStrings, languageArray, suggestions, searchTokens);
+ //if no match where found in the result .. fallback to a search for the
+ //current token
+ if(suggestions.isEmpty() && numResults > 0 && searchStrings.size() > 1){
+ //there where results, but no one matched ...
+ // ... it is most likely a case where the used search terms are
+ // not releated. So try to query for the active token only
+ searchTokens = Collections.singletonList(state.getToken());
+ log.debug(" > No match for '{}' searchStrings ... ", searchStrings);
+ searchStrings = Collections.singletonList(state.getToken().token.getSpan());
+ log.debug(" ... fallback to search for active token '{}' ...",searchStrings);
+ performLookup(searchStrings, languageArray, suggestions, searchTokens);
+ }
+ //sort the suggestions
+ if(suggestions.size()>1){
+ Collections.sort(suggestions,Suggestion.MATCH_TYPE_SUGGESTION_COMPARATOR);
+ }
+ return suggestions;
+ }
+ /**
+ * @param searchStrings
+ * @param languageArray
+ * @param suggestions
+ * @param searchTokens
+ * @return
+ * @throws EntitySearcherException
+ */
+ private int performLookup(List<String> searchStrings, String[] languageArray,
+ List<Suggestion> suggestions, List<TokenData> searchTokens) throws EntitySearcherException {
+ int minProcessedResults = linkerConfig.getMaxSuggestions()*3;
+ int lookupLimit = Math.max(MIN_SEARCH_LIMIT, linkerConfig.getMaxSuggestions()*2*searchTokens.size());
+ int maxResults = lookupLimit*2;
+ int offset = 0;
+ int numFiltered = 0;
+ boolean moreResultsAvailable = true;
+ int numResults = 0;
+ //search for entities until
+ // (1) we have more as MAX_SUGGESTION results
+ // (2) no more results are available
+ // (3) the number of processed Entities is smaller as two times the
+ // suggestions
+ // (4) the number of requested Entities is smaller as two times the
+ // lookup limit.
+ //NOTE: making multiple requests can decrease the performance a lot.
+ // Because of that those limits assure that no more than two
+ // requests are made for the same lookup.
+ while(suggestions.size() < linkerConfig.getMaxSuggestions() &&
+ moreResultsAvailable && (numResults-numFiltered) < (minProcessedResults) &&
+ numResults < maxResults){
+ Collection<? extends Entity> results;
+ log.debug(" > request entities [{}-{}] entities ...",offset,(offset+lookupLimit));
+ lookupStats.begin(); //keep statistics
+ results = entitySearcher.lookup(linkerConfig.getNameField(),
+ linkerConfig.getSelectedFields(), searchStrings, languageArray,
+ lookupLimit, offset);
+ lookupStats.complete();
+ log.debug(" < found {} entities ...",results.size());
+ //queries might return more as the requested results
+ moreResultsAvailable = results.size() >= lookupLimit;
+ numResults = numResults + results.size();
+ offset = numResults;
+ matchingStats.begin();
+ numFiltered = numFiltered + processLookupResults(searchTokens, results, suggestions);
+ matchingStats.complete();
+ //sort the suggestions
+ }
+ return numResults;
+ }
+ /**
+ * Processes the parsed entity lookup results and adds suggestions to the
+ * parsed suggestion list
+ * @param results the results
+ * @param suggestions the suggestions
+ * @return the number of filtered results
+ */
+ private int processLookupResults(List<TokenData> searchTokens, Collection<? extends Entity> results, List<Suggestion> suggestions) {
+ int numFiltered = 0;
for(Entity result : results){
if(log.isDebugEnabled()){
log.debug(" > {} (ranking: {})",result.getId(),result.getEntityRanking());
}
- Suggestion suggestion = matchLabels(result);
- if(suggestion.getMatch() != MATCH.NONE){
- if(log.isDebugEnabled()){
- log.debug(" + {}",suggestion);
+ numQueryResults++;
+ //white/black list based entity type filtering (STANBOL-1111)
+ boolean filtered = false;
+ if(linkerConfig.isEntityTypeFilteringActive()){
+ filtered = filterEntity(result.getReferences(linkerConfig.getTypeField()));
+ }
+ if(!filtered){
+ Suggestion suggestion = matchLabels(searchTokens, result);
+ if(suggestion.getMatch() != MATCH.NONE){
+ if(log.isDebugEnabled()){
+ log.debug(" + {}",suggestion);
+ }
+ suggestions.add(suggestion);
+ } else {
+ log.debug(" - no match");
}
- suggestions.add(suggestion);
- } else {
- log.debug(" - no match");
+ } else {//do not process Entities with a filtered type
+ numFilteredResults++; //global statistics
+ numFiltered++;
}
}
- //sort the suggestions
- if(suggestions.size()>1){
- Collections.sort(suggestions,Suggestion.MATCH_TYPE_SUGGESTION_COMPARATOR);
+ return numFiltered;
+ }
+
+ public boolean filterEntity(Iterator<UriRef> entityTypes){
+ Map<UriRef, Integer> whiteList = linkerConfig.getWhitelistedTypes();
+ Map<UriRef, Integer> blackList = linkerConfig.getBlacklistedTypes();
+ Integer w = null;
+ Integer b = null;
+ while(entityTypes.hasNext()){
+ UriRef type = entityTypes.next();
+ Integer act = whiteList.get(type);
+ if(act != null){
+ if(w == null || act.compareTo(w) < 0){
+ w = act;
+ }
+ if(act.intValue() == 0){
+ break;
+ }
+ }
+ act = blackList.get(type);
+ if(act != null){
+ if(b == null || act.compareTo(b) < 0){
+ b = act;
+ }
+ if(act.intValue() == 0){
+ break;
+ }
+ }
+ }
+ if(w == null && b == null){
+ return !linkerConfig.isDefaultWhitelistTypes();
+ } else if(w != null){
+ return b == null || w.compareTo(b) < 0 ? false : true;
+ } else { //w == null && b != null
+ return true; //filter
}
- //TODO: Work in Progress feature ... allowing to refine search if no
- // suggestion is found but results where present
- // However this would need full limit/offset support for the
- // EntitySearcher. (rwesten 2012-05-21)
-// Integer maxResults = entitySearcher.getLimit();
-// if(maxResults == null){
-// maxResults = 1; //fall back to 1 if limit is not known
-// }
-// if(suggestions.isEmpty() && //if no suggestions where found
-// results.size() >= maxResults && //but the query had max results
-// //than the actual entity might not be within the first LIMIT results
-// searchStrings.size() > 1){ //if multiple words where used for the search
-// //try again with only a single word
-// suggestions = lookupEntities(Collections.singletonList(searchStrings.get(0)));
-//
-// }
- //remove all elements > config.getMaxSuggestions()
- return suggestions;
}
+
/**
* Matches the labels of the parsed {@link Representation} with the Tokens of
* the texts (beginning with the currently active
@@ -511,30 +740,35 @@ public class EntityLinker {
* {@link EntitySearcher#getNameField()} property.
* @return The result of the matching.
*/
- private Suggestion matchLabels(Entity entity) {
- String curLang = state.getLanguage(); //language of the current sentence
- String defLang = linkerConfig.getDefaultLanguage(); //configured default language
- String mainLang;
- int countryCodeIndex = state.getLanguage() == null ? -1 : state.getLanguage().indexOf('-');
+ private Suggestion matchLabels(List<TokenData> searchTokens, Entity entity) {
+ String curLang = documentLang; //language of the current sentence
+ String defLang = defaultLang; //configured default language
+ String mainLang = documentMainLang;
Collection<PlainLiteral> mainLangLabels;
- if(countryCodeIndex >= 2){
- mainLang = state.getLanguage().substring(0,countryCodeIndex);
+ if(documentMainLang != null){
+ mainLang = documentMainLang;
mainLangLabels = new ArrayList<PlainLiteral>();
} else {
- mainLang = curLang;
+ mainLang = documentLang;
mainLangLabels = Collections.emptyList();
}
Iterator<PlainLiteral> labels = entity.getText(linkerConfig.getNameField());
Suggestion match = new Suggestion(entity);
Collection<PlainLiteral> defaultLabels = new ArrayList<PlainLiteral>();
boolean matchedLangLabel = false;
+ //avoid matching multiple labels with the exact same lexical.
+ Set<String> matchedLabels = new HashSet<String>();
while(labels.hasNext()){
PlainLiteral label = labels.next();
+ numLabels++;
String lang = label.getLanguage() != null ? label.getLanguage().toString() : null;
if((lang == null && curLang == null) ||
(lang != null && curLang != null && lang.equalsIgnoreCase(curLang))){
- matchLabel(match, label);
- matchedLangLabel = true;
+ if(!matchedLabels.contains(label.getLexicalForm())){
+ matchLabel(searchTokens, match, label);
+ matchedLabels.add(label.getLexicalForm());
+ matchedLangLabel = true;
+ }
} else if((lang == null && mainLang == null) ||
(lang != null && mainLang != null && lang.equalsIgnoreCase(mainLang))){
mainLangLabels.add(label);
@@ -546,8 +780,11 @@ public class EntityLinker {
//try to match main language labels
if(!matchedLangLabel || match.getMatch() == MATCH.NONE){
for(PlainLiteral mainLangLabel : mainLangLabels){
- matchLabel(match, mainLangLabel);
- matchedLangLabel = true;
+ if(!matchedLabels.contains(mainLangLabel.getLexicalForm())){
+ matchLabel(searchTokens, match, mainLangLabel);
+ matchedLabels.add(mainLangLabel.getLexicalForm());
+ matchedLangLabel = true;
+ }
}
}
//use only labels in the default language if there is
@@ -555,7 +792,10 @@ public class EntityLinker {
// * no MATCH was found in the current language
if(!matchedLangLabel || match.getMatch() == MATCH.NONE){
for(PlainLiteral defaultLangLabel : defaultLabels){
- matchLabel(match, defaultLangLabel);
+ if(!matchedLabels.contains(defaultLangLabel.getLexicalForm())){
+ matchLabel(searchTokens, match, defaultLangLabel);
+ matchedLabels.add(defaultLangLabel.getLexicalForm());
+ }
}
}
return match;
@@ -565,7 +805,8 @@ public class EntityLinker {
* @param suggestion
* @param label
*/
- private void matchLabel(Suggestion suggestion, PlainLiteral label) {
+ private void matchLabel(List<TokenData> searchTokens, Suggestion suggestion, PlainLiteral label) {
+// test.begin();
String text = label.getLexicalForm();
String lang = label.getLanguage() == null ? null : label.getLanguage().toString();
if(!linkerConfig.isCaseSensitiveMatching()){
@@ -585,7 +826,9 @@ public class EntityLinker {
if(!hasAlphaNumericChar){
offset++;
} else if(offset > 0){
- unprocessedLabelTokens[i-offset] = unprocessedLabelTokens[i];
+ String token = unprocessedLabelTokens[i];
+ token = StringUtils.replaceChars(token,".","");
+ unprocessedLabelTokens[i-offset] = token;
}
}
String[] labelTokens;
@@ -595,13 +838,24 @@ public class EntityLinker {
labelTokens = new String[unprocessedLabelTokens.length-offset];
System.arraycopy(unprocessedLabelTokens, 0, labelTokens, 0, labelTokens.length);
}
- Set<String> labelTokenSet = new HashSet<String>(
- Arrays.asList(labelTokens));
+ //holds the tokens and their position within the label. NOTE that the same
+ //token may appear multiple times in the label (e.g. "Da Da Bing"
+ Map<String,List<Integer>> labelTokenMap = new HashMap<String, List<Integer>>();
+ for(int i=0;i < labelTokens.length; i++){
+ List<Integer> tokenIndexes = labelTokenMap.get(labelTokens[i]);
+ if(tokenIndexes == null){
+ tokenIndexes = new ArrayList<Integer>(2);
+ labelTokenMap.put(labelTokens[i], tokenIndexes);
+ }
+ tokenIndexes.add(Integer.valueOf(i));
+ }
+ NavigableMap<Integer, String> matchedLabelTokens = new TreeMap<Integer,String>();
int foundProcessableTokens = 0;
int foundTokens = 0;
float foundTokenMatch = 0;
//ensure the correct order of the tokens in the suggested entity
boolean search = true;
+ boolean activeTokenNotMatched = true;
int firstFoundIndex = -1;
int firstProcessableFoundIndex = -1;
int lastFoundIndex = -1;
@@ -612,6 +866,7 @@ public class EntityLinker {
String currentTokenText;
int currentTokenLength;
int notFound = 0;
+
int matchedTokensNotWithinProcessableTokenSpan = 0;
int foundTokensWithinCoveredProcessableTokens = 0;
float minTokenMatchFactor = linkerConfig.getMinTokenMatchFactor();
@@ -626,6 +881,8 @@ public class EntityLinker {
if(currentTokenText == null) { //no lemma available
currentTokenText = currentToken.getTokenText(); //fallback to text
}
+ //ignore '.' in tokens to ensure that 'D.C.' matches 'DC' ...
+ currentTokenText = StringUtils.replaceChars(currentTokenText,".","");
if(!linkerConfig.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
}
@@ -647,14 +904,18 @@ public class EntityLinker {
matchFactor = matchCount/maxLength; //how good is the match
//remove matched labels from the set to disable them for
//a later random oder search
- labelTokenSet.remove(labelTokenText);
+ Integer labelTokenIndex = getLabelTokenIndex(labelTokenText, i, labelTokenMap);
+ matchedLabelTokens.put(labelTokenIndex, labelTokenText);
}
}
}
if(!found){
//search for a match in the wrong order
//currently only exact matches (for testing)
- if(found = labelTokenSet.remove(currentTokenText)){
+ Integer index = getLabelTokenIndex(currentTokenText, lastfoundLabelIndex+1, labelTokenMap);
+ if(index != null){
+ matchedLabelTokens.put(index, currentTokenText);
+ found = true;
matchFactor = 0.7f;
}
}
@@ -683,8 +944,15 @@ public class EntityLinker {
}
lastFoundIndex = currentIndex;
} else { //not found
+ if(state.getToken().index == currentToken.index){
+ //the currently active Token MUST BE matched
+ search = false;
+ activeTokenNotMatched = true;
+ }
notFound++;
- if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
+ //stop forward search if
+// if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
+ if(!searchTokens.contains(currentToken)){
//stop as soon as a token that needs to be processed is
//not found in the label or the maximum number of tokens
//that are not processable are not found
@@ -699,10 +967,13 @@ public class EntityLinker {
int labelIndex = firstFoundLabelIndex-1;
notFound = 0;
matchedTokensNotWithinProcessableTokenSpan = 0;
- search = true;
- while(search && labelIndex >= 0 && currentIndex > state.getConsumedIndex()){
+ if(activeTokenNotMatched){ //do not search backwards if the active token
+ //was not found
+ search = true;
+ }
+ while(search && labelIndex >= 0 && currentIndex >= 0){// && currentIndex > state.getConsumedIndex()){
String labelTokenText = labelTokens[labelIndex];
- if(labelTokenSet.contains(labelTokenText)){ //still not matched
+ if(labelTokenMap.containsKey(labelTokenText)){ //still not matched
currentToken = state.getTokens().get(currentIndex);
currentTokenText = linkerConfig.isLemmaMatching() ?
currentToken.getTokenLemma() : currentToken.getTokenText();
@@ -712,6 +983,7 @@ public class EntityLinker {
if(!linkerConfig.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
}
+ currentTokenText = StringUtils.replaceChars(currentTokenText,".","");
currentTokenLength = currentTokenText.length();
boolean found = false;
float matchFactor = 0f;
@@ -745,7 +1017,8 @@ public class EntityLinker {
foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
firstFoundIndex = currentIndex;
labelIndex--;
- labelTokenSet.remove(labelTokenText);
+ Integer foundIndex = getLabelTokenIndex(labelTokenText, currentIndex, labelTokenMap);
+ matchedLabelTokens.put(foundIndex, labelTokenText);
} else {
notFound++;
if(currentToken.isMatchable || notFound > linkerConfig.getMaxNotFound()){
@@ -776,6 +1049,7 @@ public class EntityLinker {
if(linkerConfig.isCaseSensitiveMatching() ? currentText.equals(text) : currentText.equalsIgnoreCase(text)){
labelMatch = new LabelMatch(firstFoundIndex, coveredTokens, label);
} else {
+ int coveredLabelTokens = matchedLabelTokens.lastKey().intValue()-matchedLabelTokens.firstKey().intValue()+1;
if(foundTokens == labelTokens.length && foundTokens == coveredTokens){
//if all token matched set found to covered: May be lower because only
//processable tokens are counted, but FULL also checks
@@ -785,7 +1059,7 @@ public class EntityLinker {
}
labelMatch = new LabelMatch(firstProcessableFoundIndex, coveredProcessableTokens,
foundProcessableTokens,foundTokensWithinCoveredProcessableTokens,
- foundTokenMatch/foundTokens,label,labelTokens.length);
+ foundTokenMatch/(float)foundTokens,label,labelTokens.length, coveredLabelTokens);
}
if(labelMatch.getLabelScore() >= linkerConfig.getMinLabelScore() &&
labelMatch.getTextScore() >= linkerConfig.getMinTextScore() &&
@@ -793,6 +1067,55 @@ public class EntityLinker {
suggestion.addLabelMatch(labelMatch);
}
} //else NO tokens found -> nothing to do
+// test.complete();
+ }
+ /**
+ * Utility Method that searches for the Index of the parsed label token text
+ * within the labelTokenMap. Matched tokens are removed from the parsed
+ * LabelTokenMap <p>
+ * NOTE: This is necessary, because in cases where Labels do contain the same
+ * token twice, it might not be always clear which token is the matching one.
+ * Especially if the order of the Tokens in the Text does not exactly match
+ * the order within the Label. This Method tries always to find the matching
+ * token closest to the parsed currentIndex.
+ * It iterates backwards to prefer Tokens that occur later as the current index
+ * in the tokenized label.
+ * @param labelTokenText the text of the current labelToken
+ * @param currentIndex the current index of the processing (or if not known
+ * the last matched index of an token within the label
+ * @param labelTokenMap the Map holding tokens as key and a list of occurrences
+ * as values or <code>null</code> if no Token with the parsed labelTokenText
+ * was present as key in the parsed labelTokenMap
+ * @return the index of the selected label token
+ */
+ private Integer getLabelTokenIndex(String labelTokenText, int currentIndex,
+ Map<String,List<Integer>> labelTokenMap) {
+ List<Integer> tokenIndexes = labelTokenMap.get(labelTokenText);
+ if(tokenIndexes == null){
+ return null;
+ }
+ //try to remove the closest index in the map
+ Integer labelTokenIndex = Integer.valueOf(currentIndex);
+ //search the closest position
+ int closest = Integer.MAX_VALUE;
+ int closestIndex = -1;
+ for(int p = tokenIndexes.size()-1; p >= 0; p--){
+ Integer index = tokenIndexes.get(p);
+ int dif = Math.abs(index.intValue()-currentIndex);
+ if(Math.abs(index.intValue()-currentIndex) < closest){
+ closest = dif;
+ closestIndex = p;
+ labelTokenIndex = index;
+ if(closest == 0){
+ break;
+ }
+ }
+ }
+ tokenIndexes.remove(closestIndex);
+ if(tokenIndexes.isEmpty()){
+ labelTokenMap.remove(labelTokenText);
+ }
+ return labelTokenIndex;
}
/**
* Compares to token with each other and returns the longest match. The
@@ -834,5 +1157,34 @@ public class EntityLinker {
}
return f > b ? f : b;
}
+ /**
+ * This logs the statistics about the processing process
+ * @param log the logger used to log the statistics
+ */
+ public void logStatistics(Logger log){
+ log.info("EntityLinking Statistics:");
+ double textProcessingDuration = textProcessingStats.getDuration();
+ double lookupDuration = lookupStats.getDuration();
+ double matchingDuration = matchingStats.getDuration();
+ double rankingDuration = rankingStats.getDuration();
+ double other = processingTime-textProcessingDuration-lookupDuration-matchingDuration;
+ log.info(" - overal: {}ms (text processing: {}%, lookup: {}%, matching {}%, ranking {}%, other {}%)", new Object[]{
+ processingTime,
+ Math.round(textProcessingDuration*100/(double)processingTime),
+ Math.round(lookupDuration*100/(double)processingTime),
+ Math.round(matchingDuration*100/(double)processingTime),
+ Math.round(rankingDuration*100/(double)processingTime),
+ Math.round(other*100/(double)processingTime),
+ });
+ textProcessingStats.printStatistics(log);
+ lookupStats.printStatistics(log);
+ log.info(" - {} query results ({} filtered - {}%)",
+ new Object[]{numQueryResults,numFilteredResults,
+ numFilteredResults*100f/(float)numQueryResults});
+ matchingStats.printStatistics(log);
+ rankingStats.printStatistics(log);
+// test.printStatistics(log);
+// test2.printStatistics(log);
+ }
}
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java Tue Jun 25 06:47:12 2013
@@ -57,10 +57,10 @@ public class LabelMatch {
* @param span
*/
protected LabelMatch(int start, int span, PlainLiteral label){
- this(start,span,span,span,1f,label,span);
+ this(start,span,span,span,1f,label,span,span);
}
- protected LabelMatch(int start, int span,int processableMatchCount, int matchCount, float tokenMatchScore,PlainLiteral label,int labelTokenCount){
+ protected LabelMatch(int start, int span,int processableMatchCount, int matchCount, float tokenMatchScore,PlainLiteral label,int labelTokenCount, int coveredLabelTokenCount){
if(start < 0){
throw new IllegalArgumentException("parsed start position MUST BE >= 0!");
}
@@ -75,7 +75,7 @@ public class LabelMatch {
this.label = label;
if(processableMatchCount <= 0){
match = MATCH.NONE;
- } else if(processableMatchCount == span){
+ } else if(processableMatchCount == span && matchCount == coveredLabelTokenCount){
match = MATCH.FULL;
} else {
match = MATCH.PARTIAL;
@@ -88,9 +88,9 @@ public class LabelMatch {
this.labelTokenCount = labelTokenCount;
//init scores();
double suggestionMatchScore = matchCount*this.tokenMatchScore;
- textScore = suggestionMatchScore/this.span;
- labelScore = suggestionMatchScore/this.labelTokenCount;
- score = textScore*labelScore;
+ textScore = suggestionMatchScore/(double)this.span;
+ labelScore = suggestionMatchScore/(double)this.labelTokenCount;
+ score = textScore * labelScore;
if(span < processableMatchCount){
throw new IllegalArgumentException("The span '" + span
+ "' MUST BE >= then number of matched processable tokens'"
@@ -207,12 +207,21 @@ public class LabelMatch {
public static final Comparator<LabelMatch> DEFAULT_LABEL_TOKEN_COMPARATOR = new Comparator<LabelMatch>() {
@Override
public int compare(LabelMatch arg0, LabelMatch arg1) {
- if(arg0.match == MATCH.NONE || arg1.match == MATCH.NONE ||
- arg0.processableMatchCount == arg1.processableMatchCount){
- return arg1.match.ordinal() - arg0.match.ordinal(); //higher ordinal first
- } else {
+ if(arg0.match.ordinal() >= MATCH.FULL.ordinal() && //for FULL or EXACT matches
+ arg1.match.ordinal() >= MATCH.FULL.ordinal()){
+ return arg1.processableMatchCount - arg0.processableMatchCount; //bigger should be first
+ } else if(arg0.match == arg1.match){ //also if the MATCH type is equals
return arg1.processableMatchCount - arg0.processableMatchCount; //bigger should be first
+ } else { //sort by MATCH type
+ return arg1.match.ordinal() - arg0.match.ordinal(); //higher ordinal first
}
+// OLD IMPLEMENTATION
+// if(arg0.match == MATCH.NONE || arg1.match == MATCH.NONE ||
+// arg0.processableMatchCount == arg1.processableMatchCount){
+// return arg1.match.ordinal() - arg0.match.ordinal(); //higher ordinal first
+// } else {
+// return arg1.processableMatchCount - arg0.processableMatchCount; //bigger should be first
+// }
}
};
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java Tue Jun 25 06:47:12 2013
@@ -34,6 +34,7 @@ import java.util.Locale;
import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.FilterIterator;
+import org.apache.stanbol.commons.namespaceprefix.service.StanbolNamespacePrefixService;
import org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig;
import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
@@ -710,6 +711,7 @@ public class ProcessingState {
}
morpho = mf;
}
+
}
/**
Added: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Statistic.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Statistic.java?rev=1496359&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Statistic.java (added)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Statistic.java Tue Jun 25 06:47:12 2013
@@ -0,0 +1,81 @@
+package org.apache.stanbol.enhancer.engines.entitylinking.impl;
+
+import org.slf4j.Logger;
+
+public class Statistic {
+
+ private final String name;
+ int count = 0;
+ int closedCount = 0;
+ boolean started = false;
+ long start;
+ long duration;
+ long max = -1;
+ long min = Long.MAX_VALUE;
+ private final int numPrint;
+ private final Logger log;
+
+ public Statistic(String name){
+ this(name,-1,null);
+ }
+ public Statistic(String name, int numPrint, Logger log){
+ this.name = name;
+ this.numPrint = numPrint;
+ if(numPrint > 0){
+ this.log = log;
+ } else {
+ this.log = null;
+ }
+ }
+ public double getDuration(){
+ return this.duration/1000000.0;
+ }
+ public void begin(){
+ count++;
+ started = true;
+ start = System.nanoTime();
+ }
+ public void cancel(){
+ count--;
+ started = false;
+ }
+ public void complete(){
+ if(started){
+ long end = System.nanoTime();
+ closedCount++;
+ long dif = (end - start);
+ duration = duration + dif;
+ if(dif > max) {
+ max = dif;
+ } else if(dif < min){
+ min = dif;
+ }
+ started = false;
+ } //else close without start ... ignore
+ if(log != null && numPrint > 0){
+ if(count % numPrint == 0){
+ printStatistics(log);
+ }
+ }
+ }
+
+ public String getStatistics(){
+ int count = this.count;
+ int closedCount = this.closedCount;
+ double duration = this.duration/1000000.0;
+ double max = this.max/1000000.0;
+ double min = this.min/1000000.0;
+ StringBuilder sb = new StringBuilder(name).append(": ");
+ sb.append(duration).append("ms [");
+ sb.append("count: ").append(count).append(" | ");
+ sb.append("time: ").append(duration/(double)closedCount).append("ms (max:");
+ sb.append(max).append(", min:").append(min).append(")]");
+ return sb.toString();
+ }
+
+ public void printStatistics(Logger log){
+ log.info(" - {}", getStatistics());
+
+ }
+
+}
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java Tue Jun 25 06:47:12 2013
@@ -210,18 +210,30 @@ public class EntityLinkingEngineTest {
at.addToken(start+8,start+16).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NE",Pos.CommonNoun),1d));
at.addToken(start+17,start+21).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NUM",Pos.Numeral),1d));
at.addToken(start+21,start+22).addAnnotation(POS_ANNOTATION, Value.value(new PosTag(")",Pos.CloseBracket),1d));
-
+
+ at.addToken(start+23, start+26).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("O",LexicalCategory.Adjective)));
+ at.addToken(start+27, start+28).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("A", LexicalCategory.Adposition)));
+
start = TEST_TEXT.indexOf("geologist");
at.addToken(start,start+9).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NE",Pos.CommonNoun),1d));
-
+
+ at.addToken(start+10, start+13).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("O", LexicalCategory.Adjective)));
+ at.addToken(start+14, start+19).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb)));
+ at.addToken(start+20, start+22).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("PP", LexicalCategory.PronounOrDeterminer)));
+
start = TEST_TEXT.indexOf("New Zealand");
at.addToken(start,start+3).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NE",Pos.CommonNoun),1d));
at.addToken(start+4,start+11).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP",Pos.ProperNoun),1d));
+ //add filler Tokens for "and worked at"
+ at.addToken(start+12, start+15).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("O", LexicalCategory.Adjective)));
+ at.addToken(start+16, start+22).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("V", LexicalCategory.Verb)));
+ at.addToken(start+23, start+25).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("PP", LexicalCategory.PronounOrDeterminer)));
+
start = TEST_TEXT.indexOf("the University of Otago");
at.addToken(start,start+3).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("ART",Pos.Article),1d));
at.addToken(start+4,start+14).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NE",Pos.CommonNoun),1d));
- at.addToken(start+15,start+17).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("OF",LexicalCategory.PronounOrDeterminer),1d));
+ at.addToken(start+15,start+17).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("OF",Pos.Preposition),1d));
at.addToken(start+18,start+23).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP",Pos.ProperNoun),1d));
at.addToken(start+23,start+24).addAnnotation(POS_ANNOTATION, Value.value(new PosTag(".",Pos.Point),1d));
}
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TestSearcherImpl.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TestSearcherImpl.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TestSearcherImpl.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TestSearcherImpl.java Tue Jun 25 06:47:12 2013
@@ -22,6 +22,7 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -75,7 +76,7 @@ public class TestSearcherImpl implements
}
@Override
- public Entity get(UriRef id, Set<UriRef> includeFields) throws IllegalStateException {
+ public Entity get(UriRef id, Set<UriRef> includeFields, String...lanuages) throws IllegalStateException {
return entities.get(id);
}
@@ -83,11 +84,11 @@ public class TestSearcherImpl implements
public Collection<? extends Entity> lookup(UriRef field,
Set<UriRef> includeFields,
List<String> search,
- String[] languages,Integer numResults) throws IllegalStateException {
+ String[] languages,Integer numResults, Integer offset) throws IllegalStateException {
if(field.equals(nameField)){
//we do not need sorting
//Representation needs to implement equals, therefore results filters multiple matches
- Set<Entity> results = new HashSet<Entity>();
+ Set<Entity> results = new LinkedHashSet<Entity>();
for(String term : search){
//TODO: adding 'zzz' to the parsed term is no good solution for
// searching ...
@@ -95,7 +96,16 @@ public class TestSearcherImpl implements
results.addAll(termResults);
}
}
- return results;
+ List<Entity> resultList = new ArrayList<Entity>(results);
+ if(offset != null && offset.intValue() > 0){
+ if(offset.intValue() > results.size()){
+ return Collections.emptyList();
+ } else {
+ return resultList.subList(offset, results.size());
+ }
+ } else {
+ return results;
+ }
} else {
throw new IllegalStateException("Lookup is only supported for the nameField '"+
nameField+"' parsed to the constructor");