You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/08/31 16:18:58 UTC
svn commit: r1379463 - in /incubator/stanbol/branches/disambiguation-engine:
defaults/src/main/resources/config/
engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/
engines/keywordextraction/src/main/java/org...
Author: rwesten
Date: Fri Aug 31 14:18:57 2012
New Revision: 1379463
URL: http://svn.apache.org/viewvc?rev=1379463&view=rev
Log:
KeywordlinkingEngine: UnitTests now work again, Improved Limit used by the EntitySearcher; Default Configuration: Corrected also some bugs in the configuration, KeywordLinkingEngine now uses 20 suggestions and 1 min found tokens (good for testing disambiguation as it results in a lot of suggestions)
Added:
incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config
- copied, changed from r1379385, incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config
Removed:
incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config
Modified:
incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config
incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config
incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java
incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
Copied: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config (from r1379385, incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config?p2=incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config&p1=incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config&r1=1379385&r2=1379463&rev=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config (original)
+++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config Fri Aug 31 14:18:57 2012
@@ -1,3 +1,3 @@
-stanbol.enhancer.chain.name="dbpedia-keyword-disambiguation"
-stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpediaKeyword","disambiguation-mlt"]
-service.ranking=I"0"
\ No newline at end of file
+stanbol.enhancer.chain.name="default"
+stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","ner","dbpediaLinking","entityhubExtraction","disambiguation-mlt"]
+service.ranking=I"-100"
\ No newline at end of file
Modified: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config (original)
+++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config Fri Aug 31 14:18:57 2012
@@ -1,3 +1,3 @@
-stanbol.enhancer.chain.name="default"
-stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","ner","dbpediaLinking","entityhubExtraction","disambiguation-mlt"]
-service.ranking=I"-100"
\ No newline at end of file
+stanbol.enhancer.chain.name="dbpedia-keyword-disambiguation"
+stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpediaKeyword","disambiguation-mlt"]
+service.ranking=I"0"
\ No newline at end of file
Modified: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config (original)
+++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config Fri Aug 31 14:18:57 2012
@@ -16,4 +16,5 @@ org.apache.stanbol.enhancer.engines.keyw
org.apache.stanbol.enhancer.engines.keywordextraction.redirectField="rdfs:seeAlso"
stanbol.enhancer.engine.name="dbpediaKeyword"
org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage="en"
-org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer=B"false"
+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer=B"false"
+org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens=I"1"
Modified: incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java Fri Aug 31 14:18:57 2012
@@ -262,9 +262,7 @@ public class DisambiguatorEngine extends
disData.allSelectedTexts,
window);
//savedEntity.getContext());
- disambiguationContext = unionString(false,
- Collections.singleton(savedEntity.getName()),
- contextSelections);
+ disambiguationContext = unionString(false, contextSelections);
//(2) I do not understand this variant (see comment for the
// EntitiesInRange(..) method
@@ -278,6 +276,11 @@ public class DisambiguatorEngine extends
// Collections.singleton(context), //the context
// contextSelections); //other selected parsed in the context
+ //or just the name of the entity AND the context
+// disambiguationContext = unionString(false,
+// Collections.singleton(savedEntity.getName()),
+// contextSelections);
+
//(4) TODO: I would also like to have the possibility to disambiguate
// using URIs of Entities suggested for other TextAnnotations
// within the context.
Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Fri Aug 31 14:18:57 2012
@@ -205,7 +205,12 @@ public class KeywordLinkingEngine
* The literal representing the LangIDEngine as creator.
*/
public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
-
+
+ /**
+ * The default value for the LIMIT of the {@link EntitySearcher}
+ */
+ private static final int DEFAULT_ENTITY_SEARCHER_LIMIT = 10;
+
private EntitySearcher entitySearcher;
private EntityLinkerConfig linkerConfig;
private TextAnalyzerConfig nlpConfig;
@@ -873,9 +878,9 @@ public class KeywordLinkingEngine
}
//TODO: make limit configurable!
if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){
- entitySearcher = new EntityhubSearcher(context.getBundleContext(),10);
+ entitySearcher = new EntityhubSearcher(context.getBundleContext(),DEFAULT_ENTITY_SEARCHER_LIMIT);
} else {
- entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,10);
+ entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,DEFAULT_ENTITY_SEARCHER_LIMIT);
}
}
/**
Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Fri Aug 31 14:18:57 2012
@@ -56,6 +56,8 @@ public class EntityLinker {
* The map holding the results of the linking process
*/
private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
+
+ private Integer lookupLimit;
/**
* After {@link #process()}ing this returns the entities linked for the
@@ -79,6 +81,7 @@ public class EntityLinker {
this.entitySearcher = taxonomy;
this.config = config;
this.state = new ProcessingState(content.getAnalysedText());
+ this.lookupLimit = Math.max(10,config.getMaxSuggestions()*2);
}
/**
* Steps over the sentences, chunks, tokens of the {@link #sentences}
@@ -289,8 +292,11 @@ public class EntityLinker {
private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException {
Collection<? extends Representation> results;
try {
- results = entitySearcher.lookup(config.getNameField(),config.getSelectedFields(),
- searchStrings, state.getSentence().getLanguage(),config.getDefaultLanguage());
+ results = entitySearcher.lookup(config.getNameField(),
+ config.getSelectedFields(),
+ searchStrings,
+ new String[]{state.getSentence().getLanguage(),config.getDefaultLanguage()},
+ lookupLimit);
} catch (RuntimeException e) {
throw new EngineException(e.getMessage(),e);
}
@@ -555,6 +561,7 @@ public class EntityLinker {
//processable tokens are counted, but Exact also checks
//of non-processable!
foundTokens = coveredTokens;
+ foundProcessableTokens = coveredProcessableTokens;
} else if((foundProcessableTokens >= config.getMinFoundTokens() ||
//NOTE (rwesten, 2012-05-21): Do not check if all covered
// Tokens are found, but if all Tokens of the Label are
Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java Fri Aug 31 14:18:57 2012
@@ -44,10 +44,11 @@ public interface EntitySearcher {
* to be included. Other fields MAY also be included.
* @param search the tokens to search for. MUST NOT be <code>null</code>
* @param languages the languages to include in the search
+ * @param limit The maximum number of results or <code>null</code> to use the default
* @return the Representations found for the specified query
* @throws T An exception while searching for concepts
*/
- Collection<? extends Representation> lookup(String field,Set<String> includeFields,List<String> search,String...languages) throws IllegalStateException;
+ Collection<? extends Representation> lookup(String field, Set<String> includeFields, List<String> search, String[] languages,Integer limit) throws IllegalStateException;
/**
* Lookup a concept of the taxonomy by the id.
* @param id the id
Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java Fri Aug 31 14:18:57 2012
@@ -62,13 +62,19 @@ public final class EntityhubSearcher ext
public Collection<? extends Representation> lookup(String field,
Set<String> includeFields,
List<String> search,
- String... languages) throws IllegalStateException {
+ String[] languages,
+ Integer limit) throws IllegalStateException {
Entityhub entityhub = getSearchService();
if(entityhub == null){
throw new IllegalStateException("The Entityhub is currently not active");
}
FieldQuery query = EntitySearcherUtils.createFieldQuery(entityhub.getQueryFactory(),
field, includeFields, search, languages);
+ if(limit != null && limit > 0){
+ query.setLimit(limit);
+ } else if(this.limit != null){
+ query.setLimit(this.limit);
+ }
QueryResultList<Representation> results;
try {
results = entityhub.find(query);
Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java Fri Aug 31 14:18:57 2012
@@ -69,7 +69,8 @@ public final class ReferencedSiteSearche
public Collection<? extends Representation> lookup(String field,
Set<String> includeFields,
List<String> search,
- String... languages) throws IllegalStateException {
+ String[] languages,
+ Integer limit) throws IllegalStateException {
//build the query and than return the result
Site site = getSearchService();
if(site == null){
@@ -77,8 +78,10 @@ public final class ReferencedSiteSearche
}
FieldQuery query = EntitySearcherUtils.createFieldQuery(site.getQueryFactory(),
field, includeFields, search, languages);
- if(limit != null){
+ if(limit != null && limit > 0){
query.setLimit(limit);
+ } else if(this.limit != null){
+ query.setLimit(this.limit);
}
QueryResultList<Representation> results;
try {
Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java Fri Aug 31 14:18:57 2012
@@ -74,7 +74,8 @@ public class TestSearcherImpl implements
public Collection<? extends Representation> lookup(String field,
Set<String> includeFields,
List<String> search,
- String... languages) throws IllegalStateException {
+ String[] languages,
+ Integer limit) throws IllegalStateException {
if(field.equals(nameField)){
//we do not need sorting
//Representation needs to implement equals, therefore results filters multiple matches