You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/05/21 14:36:25 UTC
svn commit: r1340995 - in
/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src:
main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/
main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/
main/java/org/ap...
Author: rwesten
Date: Mon May 21 12:36:24 2012
New Revision: 1340995
URL: http://svn.apache.org/viewvc?rev=1340995&view=rev
Log:
STANBOL-622: The KeywordLinkingEngine now checks whether all Tokens (words) of an entity label are matched within the text before marking a Suggestion as FULL. All suggestions that do not match all Tokens of the label are now considered PARTIAL.
other:
* added support for a limit (the maximum number of returned Representations) to the EntitySearcher interface. This will allow the limit to be made configurable (not yet implemented)
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Mon May 21 12:36:24 2012
@@ -832,10 +832,11 @@ public class KeywordLinkingEngine
throw new ConfigurationException(REFERENCED_SITE_ID,
"The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
}
+ //TODO: make limit configurable!
if(Entityhub.ENTITYHUB_IDS.contains(refSiteId.toLowerCase())){
- entitySearcher = new EntityhubSearcher(context.getBundleContext());
+ entitySearcher = new EntityhubSearcher(context.getBundleContext(),10);
} else {
- entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),refSiteId);
+ entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),refSiteId,10);
}
}
/**
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Mon May 21 12:36:24 2012
@@ -284,6 +284,22 @@ public class EntityLinker {
if(suggestions.size()>1){
Collections.sort(suggestions,Suggestion.DEFAULT_SUGGESTION_COMPARATOR);
}
+ //TODO: Work in Progress feature ... allowing to refine search if no
+ // suggestion is found but results where present
+ // However this would need full limit/offset support for the
+ // EntitySearcher. (rwesten 2012-05-21)
+// Integer maxResults = entitySearcher.getLimit();
+// if(maxResults == null){
+// maxResults = 1; //fall back to 1 if limit is not known
+// }
+// if(suggestions.isEmpty() && //if no suggestions where found
+// results.size() >= maxResults && //but the query had max results
+// //than the actual entity might not be within the first LIMIT results
+// searchStrings.size() > 1){ //if multiple words where used for the search
+// //try again with only a single word
+// suggestions = lookupEntities(Collections.singletonList(searchStrings.get(0)));
+//
+// }
//remove all elements > config.getMaxSuggestions()
return suggestions;
}
@@ -466,6 +482,7 @@ public class EntityLinker {
String labelTokenText = labelTokens[labelIndex];
if(labelTokenSet.remove(labelTokenText)){ //still not matched
currentToken = state.getSentence().getTokens().get(currentIndex);
+ boolean isProcessable = isProcessableToken(currentToken);
currentTokenText = currentToken.getText();
if(!config.isCaseSensitiveMatching()){
currentTokenText = currentTokenText.toLowerCase();
@@ -484,13 +501,16 @@ public class EntityLinker {
}
}
if(found){ //found
+ if(isProcessable){
+ foundProcessableTokens++; //only count processable Tokens
+ }
foundTokens++;
foundTokenMatch = foundTokenMatch + matchFactor; //sum up the matches
firstFoundIndex = currentIndex;
currentIndex --;
} else {
notFound++;
- if(notFound > maxNotFound){
+ if(isProcessable || notFound > maxNotFound){
//stop as soon as a token that needs to be processed is
//not found in the label or the maximum number of tokens
//that are not processable are not found
@@ -519,9 +539,15 @@ public class EntityLinker {
//of non-processable!
foundTokens = coveredTokens;
} else if((foundProcessableTokens >= config.getMinFoundTokens() ||
- foundTokens == coveredTokens) &&
+ //NOTE (rwesten, 2012-05-21): Do not check if all covered
+ // Tokens are found, but if all Tokens of the Label are
+ // matched! (STANBOL-622)
+ //foundTokens == coveredTokens) &&
+ foundTokens >= labelTokens.length) &&
labelMatchScore >= 0.6f){
- if(foundTokens == coveredTokens){
+ //same as above
+ //if(foundTokens == coveredTokens){
+ if(foundTokens == labelTokens.length && foundTokenMatch == coveredTokens){
labelMatch = MATCH.FULL;
} else {
labelMatch = MATCH.PARTIAL;
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java Mon May 21 12:36:24 2012
@@ -64,4 +64,11 @@ public interface EntitySearcher {
* @return the state
*/
boolean supportsOfflineMode();
+
+ /**
+ * The maximum number of {@link Representation}s returned for {@link #lookup(String, Set, List, String...)}
+ * queries
+ * @return the Number or <code>null</code> if not known
+ */
+ Integer getLimit();
}
\ No newline at end of file
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/Suggestion.java Mon May 21 12:36:24 2012
@@ -110,12 +110,14 @@ public class Suggestion implements Compa
throw new IllegalArgumentException("For "+match+" matches the token span and count MUST BE > 0");
}
if(match == MATCH.PARTIAL){
- if(span <= count){
- throw new IllegalArgumentException("For "+match+" matches the token span MUST BE > than the token count!");
+ if(span <= count && labelTokenCount <= count){
+ throw new IllegalArgumentException("For "+match+" matches the (token span OR label token count) MUST BE > than the token count!");
}
} else {
if(span != count){
- throw new IllegalArgumentException("For "+match+" matches the token span MUST BE equals to the token count!");
+ throw new IllegalArgumentException("For "+match+" matches the token span '"
+ +span+"' MUST BE equals to the token count '"+count+"' (label: '"
+ +label.getText()+"')!");
}
}
}
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java Mon May 21 12:36:24 2012
@@ -33,8 +33,11 @@ import org.osgi.framework.BundleContext;
public final class EntityhubSearcher extends TrackingEntitySearcher<Entityhub> implements EntitySearcher {
- public EntityhubSearcher(BundleContext context) {
+ private final Integer limit;
+
+ public EntityhubSearcher(BundleContext context, Integer limit) {
super(context,Entityhub.class,null);
+ this.limit = limit != null && limit > 0 ? limit : null;
}
@Override
@@ -81,4 +84,9 @@ public final class EntityhubSearcher ext
return true; //the entityhub is always offline
}
+ @Override
+ public Integer getLimit() {
+ return limit;
+ }
+
}
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java Mon May 21 12:36:24 2012
@@ -35,12 +35,15 @@ import org.osgi.framework.BundleContext;
public final class ReferencedSiteSearcher extends TrackingEntitySearcher<ReferencedSite> implements EntitySearcher {
+
private final String siteId;
+ private final Integer limit;
- public ReferencedSiteSearcher(BundleContext context,String siteId) {
+ public ReferencedSiteSearcher(BundleContext context,String siteId, Integer limit) {
super(context, ReferencedSite.class,
Collections.singletonMap(SiteConfiguration.ID,siteId));
this.siteId = siteId;
+ this.limit = limit != null && limit > 0 ? limit : null;
}
@Override
@@ -74,6 +77,9 @@ public final class ReferencedSiteSearche
}
FieldQuery query = EntitySearcherUtils.createFieldQuery(site.getQueryFactory(),
field, includeFields, search, languages);
+ if(limit != null){
+ query.setLimit(limit);
+ }
QueryResultList<Representation> results;
try {
results = site.find(query);
@@ -91,4 +97,9 @@ public final class ReferencedSiteSearche
//Do not throw an exception here if the site is not available. Just return false
return site == null ? false : site.supportsLocalMode();
}
+
+ @Override
+ public Integer getLimit() {
+ return limit;
+ }
}
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java?rev=1340995&r1=1340994&r2=1340995&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java Mon May 21 12:36:24 2012
@@ -98,4 +98,10 @@ public class TestSearcherImpl implements
return true;
}
+
+ @Override
+ public Integer getLimit() {
+ return null;
+ }
+
}