You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/12/04 08:39:05 UTC

svn commit: r1547721 - /stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java

Author: rwesten
Date: Wed Dec  4 07:39:05 2013
New Revision: 1547721

URL: http://svn.apache.org/r1547721
Log:
STANBOL-1230: merged LookupCache support to the 0.12 releasing branch

Modified:
    stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java

Modified: stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java?rev=1547721&r1=1547720&r2=1547721&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java Wed Dec  4 07:39:05 2013
@@ -36,7 +36,6 @@ import org.apache.clerezza.rdf.core.Trip
 import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
-import org.apache.commons.lang.LocaleUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
 import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
@@ -78,7 +77,7 @@ public class EntityLinker {
 
     private LinkingStateAware linkingStateAware;
 
-    private int minSearchResults;
+    final int minSearchResults;
     
     //Language configuration
     final String documentLang;
@@ -87,15 +86,18 @@ public class EntityLinker {
     
     private Statistic textProcessingStats = new Statistic("Text Processing");
     private Statistic lookupStats = new Statistic("Vocabulary Lookup");
+    private int cacheHits = 0;
     private int numQueryResults = 0;
     private int numFilteredResults = 0;
     private Statistic matchingStats = new Statistic("Label Matching");
     private Statistic rankingStats = new Statistic("Suggestion Ranking");
 //    private Statistic test = new Statistic("test1");
 //    private Statistic test2_ = new Statistic("test2");
-    private int numLabels = 0;
+//    private int numLabels = 0;
     private long processingTime = -1;
 
+    private HashMap<List<String>,List<Entity>> lookupCache;
+
 
     public EntityLinker(AnalysedText analysedText, String language,
                         LanguageProcessingConfig textProcessingConfig,
@@ -110,6 +112,7 @@ public class EntityLinker {
                 EntityLinkerConfig linkerConfig,
                 LabelTokenizer labelTokenizer, LinkingStateAware linkingStateAware) {
         //this.analysedText = analysedText;
+        this.lookupCache = new HashMap<List<String>,List<Entity>>();
         this.entitySearcher = entitySearcher;
         this.linkerConfig = linkerConfig;
         this.textProcessingConfig = textProcessingConfig;
@@ -584,19 +587,44 @@ public class EntityLinker {
         }
         String[] languageArray = languages.toArray(new String[languages.size()]);
         List<Suggestion> suggestions = new ArrayList<Suggestion>();
-        //perform the lookup with the parsed parameter
-        int numResults = performLookup(searchStrings, languageArray, suggestions, searchTokens);
-        //if no match where found in the result .. fallback to a search for the
-        //current token
-        if(suggestions.isEmpty() && numResults > 0 && searchStrings.size() > 1){
-            //there where results, but no one matched ...
-            //   ... it is most likely a case where the used search terms are
-            //       not releated. So try to query for the active token only
-            searchTokens = Collections.singletonList(state.getToken());
-            log.debug("   > No match for '{}' searchStrings ... ", searchStrings);
-            searchStrings = Collections.singletonList(state.getToken().token.getSpan());
-            log.debug("     ... fallback to search for active token '{}' ...",searchStrings);
-            performLookup(searchStrings, languageArray, suggestions, searchTokens);
+        //check if we have the search strings in the cache
+        List<Entity> results = lookupCache.get(searchStrings);
+        if(results != null){ //query is cached
+            cacheHits++;
+            //match the cached results
+            for(Entity result : results){
+                processLookupResult(searchTokens, result, suggestions);
+            }
+        } else { // we need to perform a new query
+            results = new ArrayList<Entity>();
+            //perform the lookup with the parsed parameter
+            int numResults = performLookup(searchStrings, languageArray, suggestions, searchTokens, results);
+            //cache the results
+            lookupCache.put(searchStrings, results);
+            //if no match was found in the results .. fall back to a search for the
+            //current token
+            if(suggestions.isEmpty() && numResults > 0 && searchStrings.size() > 1){
+                //there were results, but none of them matched ...
+                //   ... it is most likely a case where the used search terms are
+                //       not related. So try to query for the active token only
+                log.debug("   > No match for '{}' searchStrings ... ", searchStrings);
+                searchStrings = Collections.singletonList(getSearchString(state.getToken()));
+                searchTokens = Collections.singletonList(state.getToken());
+                results = lookupCache.get(searchStrings);
+                if(results != null){ //query is cached
+                    cacheHits++;
+                    //match the cached results
+                    for(Entity result : results){
+                        processLookupResult(searchTokens, result, suggestions);
+                    }
+                } else {
+                    results = new ArrayList<Entity>();
+                    log.debug("     ... fallback to search for active token '{}' ...",searchStrings);
+                    performLookup(searchStrings, languageArray, suggestions, searchTokens, results);
+                    //cache the results of the fall-back query
+                    lookupCache.put(searchStrings, results);
+                }
+            }
         }
         //sort the suggestions
         if(suggestions.size()>1){
@@ -609,11 +637,16 @@ public class EntityLinker {
      * @param languageArray
      * @param suggestions
      * @param searchTokens
+     * @param queryResults the list used to collect the unprocessed results of the
+     * query for the parsed parameters (entities with a filtered type are not
+     * added). This is used to cache results of queries, which avoids issuing
+     * the same query twice for an analysed document.
      * @return
      * @throws EntitySearcherException
      */
     private int performLookup(List<String> searchStrings, String[] languageArray,
-            List<Suggestion> suggestions, List<TokenData> searchTokens) throws EntitySearcherException {
+            List<Suggestion> suggestions, List<TokenData> searchTokens, 
+            List<Entity> queryResults) throws EntitySearcherException {
         int minProcessedResults = linkerConfig.getMaxSuggestions()*3;
         int lookupLimit = Math.max(MIN_SEARCH_LIMIT, linkerConfig.getMaxSuggestions()*2*searchTokens.size());
         int maxResults = lookupLimit*2;
@@ -647,7 +680,23 @@ public class EntityLinker {
             numResults = numResults + results.size();
             offset = numResults;
             matchingStats.begin();
-            numFiltered = numFiltered + processLookupResults(searchTokens, results, suggestions);
+            for(Entity result : results){ 
+                if(log.isDebugEnabled()){
+                    log.debug("    > {} (ranking: {})",result.getId(),result.getEntityRanking());
+                }
+                numQueryResults++;
+                //white/black list based entity type filtering (STANBOL-1111)
+                if(!linkerConfig.isEntityTypeFilteringActive() || 
+                        !filterEntity(result.getReferences(linkerConfig.getTypeField()))){
+                    //a valid query result
+                    queryResults.add(result);
+                    //now match the result against the current position in the text
+                    processLookupResult(searchTokens, result, suggestions);
+                } else { //do not process Entities with a filtered type
+                    numFilteredResults++; //global statistics
+                    numFiltered++;
+                }
+            }
             matchingStats.complete();
             //sort the suggestions
         }
@@ -656,38 +705,20 @@ public class EntityLinker {
     /**
      * Processes the parsed entity lookup results and adds suggestions to the
      * parsed suggestion list
-     * @param results the results
+     * @param result the result to process
      * @param suggestions the suggestions
      * @return the number of filtered results
      */
-    private int processLookupResults(List<TokenData> searchTokens, Collection<? extends Entity> results, List<Suggestion> suggestions) {
-        int numFiltered = 0;
-        for(Entity result : results){ 
+    private void processLookupResult(List<TokenData> searchTokens, Entity result, List<Suggestion> suggestions) {
+        Suggestion suggestion = matchLabels(searchTokens, result);
+        if(suggestion.getMatch() != MATCH.NONE){
             if(log.isDebugEnabled()){
-                log.debug("    > {} (ranking: {})",result.getId(),result.getEntityRanking());
-            }
-            numQueryResults++;
-            //white/black list based entity type filtering (STANBOL-1111)
-            boolean filtered = false;
-            if(linkerConfig.isEntityTypeFilteringActive()){
-                filtered = filterEntity(result.getReferences(linkerConfig.getTypeField()));
-            }
-            if(!filtered){
-                Suggestion suggestion = matchLabels(searchTokens, result);
-                if(suggestion.getMatch() != MATCH.NONE){
-                    if(log.isDebugEnabled()){
-                        log.debug("      + {}",suggestion);
-                    }
-                    suggestions.add(suggestion);
-                } else {
-                    log.debug("      - no match");
-                }
-            } else {//do not process Entities with a filtered type
-                numFilteredResults++; //global statistics
-                numFiltered++;
+                log.debug("      + {}",suggestion);
             }
+            suggestions.add(suggestion);
+        } else {
+            log.debug("      - no match");
         }
-        return numFiltered;
     }
     
     public boolean filterEntity(Iterator<UriRef> entityTypes){
@@ -761,7 +792,7 @@ public class EntityLinker {
         Set<String> matchedLabels = new HashSet<String>();
         while(labels.hasNext()){
             PlainLiteral label = labels.next();
-            numLabels++;
+            //numLabels++;
             String lang = label.getLanguage() != null ? label.getLanguage().toString() : null;
             String text = label.getLexicalForm();
             //if case-insensitive matching ... compare lower case versions
@@ -1245,6 +1276,9 @@ public class EntityLinker {
         });
         textProcessingStats.printStatistics(log);
         lookupStats.printStatistics(log);
+        float cacheHitPercentage = lookupStats.count > 0 ? //avoid division by zero
+                cacheHits*100f/(float)lookupStats.count : Float.NaN;
+        log.info("    - cache hits: {} ({}%)",cacheHits,cacheHitPercentage);
         log.info("      - {} query results ({} filtered - {}%)",
             new Object[]{numQueryResults,numFilteredResults, 
                 numFilteredResults*100f/(float)numQueryResults});