You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/05/21 14:36:25 UTC

svn commit: r1340995 - in /incubator/stanbol/trunk/enhancer/engines/keywordextraction/src: main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/ main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/ main/java/org/ap...

Author: rwesten
Date: Mon May 21 12:36:24 2012
New Revision: 1340995

URL: http://svn.apache.org/viewvc?rev=1340995&view=rev
Log:
STANBOL-622: The KeywordLinkingEngine now checks if all Tokens (words) of an entity label are matched within a text before marking a Suggestion as FULL. All suggestions that do not match all Tokens of the label are now considered as Partial.

other:

* added support for setting the limit (maximum number of returned Representations) to the EntitySearcher interface. This would allow the limit to be made configurable (not yet implemented)

Modified:
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Mon May 21 12:36:24 2012
@@ -832,10 +832,11 @@ public class KeywordLinkingEngine 
             throw new ConfigurationException(REFERENCED_SITE_ID,
                     "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
         }
+        //TODO: make limit configurable!
         if(Entityhub.ENTITYHUB_IDS.contains(refSiteId.toLowerCase())){
-            entitySearcher = new EntityhubSearcher(context.getBundleContext());
+            entitySearcher = new EntityhubSearcher(context.getBundleContext(),10);
         } else {
-            entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),refSiteId);
+            entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),refSiteId,10);
         }
     }
     /**

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Mon May 21 12:36:24 2012
@@ -284,6 +284,22 @@ public class EntityLinker {
         if(suggestions.size()>1){
             Collections.sort(suggestions,Suggestion.DEFAULT_SUGGESTION_COMPARATOR);
         }
+        //TODO: Work in Progress feature ... allowing to refine the search if no
+        //      suggestion is found but results were present.
+        //      However this would need full limit/offset support for the
+        //      EntitySearcher. (rwesten 2012-05-21)
+//        Integer maxResults = entitySearcher.getLimit();
+//        if(maxResults == null){
+//            maxResults = 1; //fall back to 1 if limit is not known
+//        }
+//        if(suggestions.isEmpty() && //if no suggestions where found
+//                results.size() >= maxResults && //but the query had max results
+//                //than the actual entity might not be within the first LIMIT results
+//                searchStrings.size() > 1){ //if multiple words where used for the search
+//            //try again with only a single word
+//            suggestions = lookupEntities(Collections.singletonList(searchStrings.get(0)));
+//            
+//        }
         //remove all elements > config.getMaxSuggestions()
         return suggestions;
     }
@@ -466,6 +482,7 @@ public class EntityLinker {
             String labelTokenText = labelTokens[labelIndex];
             if(labelTokenSet.remove(labelTokenText)){ //still not matched
                 currentToken = state.getSentence().getTokens().get(currentIndex);
+                boolean isProcessable = isProcessableToken(currentToken);
                 currentTokenText = currentToken.getText();
                 if(!config.isCaseSensitiveMatching()){
                     currentTokenText = currentTokenText.toLowerCase();
@@ -484,13 +501,16 @@ public class EntityLinker {
                     }
                 }
                 if(found){ //found
+                    if(isProcessable){
+                        foundProcessableTokens++; //only count processable Tokens
+                    }
                     foundTokens++;
                     foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
                     firstFoundIndex = currentIndex;
                     currentIndex --;
                 } else {
                     notFound++;
-                    if(notFound > maxNotFound){
+                    if(isProcessable || notFound > maxNotFound){
                         //stop as soon as a token that needs to be processed is
                         //not found in the label or the maximum number of tokens
                         //that are not processable are not found
@@ -519,9 +539,15 @@ public class EntityLinker {
                 //of non-processable!
                 foundTokens = coveredTokens;
             } else if((foundProcessableTokens >= config.getMinFoundTokens() ||
-                    foundTokens == coveredTokens) && 
+                    //NOTE (rwesten, 2012-05-21): Do not check if all covered
+                    //  Tokens are found, but if all Tokens of the Label are
+                    //  matched! (STANBOL-622)
+                    //foundTokens == coveredTokens) && 
+                    foundTokens >= labelTokens.length) &&
                     labelMatchScore >= 0.6f){
-                if(foundTokens == coveredTokens){
+                //same as above
+                //if(foundTokens == coveredTokens){
+                if(foundTokens == labelTokens.length && foundTokenMatch == coveredTokens){
                     labelMatch = MATCH.FULL;
                 } else {
                     labelMatch = MATCH.PARTIAL;

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java Mon May 21 12:36:24 2012
@@ -64,4 +64,11 @@ public interface EntitySearcher {
      * @return the state
      */
     boolean supportsOfflineMode();
+    
+    /**
+     * The maximum number of {@link Representation}s returned for {@link #lookup(String, Set, List, String...)}
+     * queries
+     * @return the Number or <code>null</code> if not known
+     */
+    Integer getLimit();
 }
\ No newline at end of file

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java Mon May 21 12:36:24 2012
@@ -110,12 +110,14 @@ public class Suggestion implements Compa
                 throw new IllegalArgumentException("For "+match+" matches the token span and count MUST BE > 0");
             }
             if(match == MATCH.PARTIAL){
-                if(span <= count){
-                    throw new IllegalArgumentException("For "+match+" matches the token span MUST BE > than the token count!");
+                if(span <= count && labelTokenCount <= count){
+                    throw new IllegalArgumentException("For "+match+" matches the (token span OR label token count) MUST BE > than the token count!");
                 }
             } else {
                 if(span != count){
-                    throw new IllegalArgumentException("For "+match+" matches the token span MUST BE equals to the token count!");
+                    throw new IllegalArgumentException("For "+match+" matches the token span '"
+                            +span+"' MUST BE equals to the token count '"+count+"' (label: '"
+                            +label.getText()+"')!");
                 }
             }
         }

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java Mon May 21 12:36:24 2012
@@ -33,8 +33,11 @@ import org.osgi.framework.BundleContext;
 
 public final class EntityhubSearcher extends TrackingEntitySearcher<Entityhub> implements EntitySearcher {
     
-    public EntityhubSearcher(BundleContext context) {
+    private final Integer limit;
+
+    public EntityhubSearcher(BundleContext context, Integer limit) {
         super(context,Entityhub.class,null);
+        this.limit = limit != null && limit > 0 ? limit : null;
     }
     
     @Override
@@ -81,4 +84,9 @@ public final class EntityhubSearcher ext
         return true; //the entityhub is always offline
     }
 
+    @Override
+    public Integer getLimit() {
+        return limit;
+    }
+
 }

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java Mon May 21 12:36:24 2012
@@ -35,12 +35,15 @@ import org.osgi.framework.BundleContext;
 
 public final class ReferencedSiteSearcher extends TrackingEntitySearcher<ReferencedSite> implements EntitySearcher {
     
+    
     private final String siteId;
+    private final Integer limit;
 
-    public ReferencedSiteSearcher(BundleContext context,String siteId) {
+    public ReferencedSiteSearcher(BundleContext context,String siteId, Integer limit) {
         super(context, ReferencedSite.class, 
             Collections.singletonMap(SiteConfiguration.ID,siteId));
         this.siteId = siteId;
+        this.limit = limit != null && limit > 0 ? limit : null;
     }
     
     @Override
@@ -74,6 +77,9 @@ public final class ReferencedSiteSearche
         }
         FieldQuery query = EntitySearcherUtils.createFieldQuery(site.getQueryFactory(), 
             field, includeFields, search, languages);
+        if(limit != null){
+            query.setLimit(limit);
+        }
         QueryResultList<Representation> results;
         try {
             results = site.find(query);
@@ -91,4 +97,9 @@ public final class ReferencedSiteSearche
         //Do not throw an exception here if the site is not available. Just return false
         return site == null ? false : site.supportsLocalMode();
     }
+
+    @Override
+    public Integer getLimit() {
+        return limit;
+    }
 }

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java Mon May 21 12:36:24 2012
@@ -98,4 +98,10 @@ public class TestSearcherImpl implements
         return true;
     }
 
+
+    @Override
+    public Integer getLimit() {
+        return null;
+    }
+
 }