You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/08/31 16:18:58 UTC

svn commit: r1379463 - in /incubator/stanbol/branches/disambiguation-engine: defaults/src/main/resources/config/ engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/ engines/keywordextraction/src/main/java/org...

Author: rwesten
Date: Fri Aug 31 14:18:57 2012
New Revision: 1379463

URL: http://svn.apache.org/viewvc?rev=1379463&view=rev
Log:
KeywordLinkingEngine: unit tests now work again; improved the limit used by the EntitySearcher. Default configuration: also corrected some bugs in the configuration; the KeywordLinkingEngine now uses 20 suggestions and a minimum of 1 found token (good for testing disambiguation, as it results in a lot of suggestions)

Added:
    incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config
      - copied, changed from r1379385, incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config
Removed:
    incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config
Modified:
    incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config
    incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config
    incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java
    incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
    incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
    incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
    incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
    incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
    incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java

Copied: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config (from r1379385, incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config?p2=incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config&p1=incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config&r1=1379385&r2=1379463&rev=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-default.config (original)
+++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-defaultdisambiguation.config Fri Aug 31 14:18:57 2012
@@ -1,3 +1,3 @@
-stanbol.enhancer.chain.name="dbpedia-keyword-disambiguation"
-stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpediaKeyword","disambiguation-mlt"]
-service.ranking=I"0"
\ No newline at end of file
+stanbol.enhancer.chain.name="default"
+stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","ner","dbpediaLinking","entityhubExtraction","disambiguation-mlt"]
+service.ranking=I"-100"
\ No newline at end of file

Modified: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config (original)
+++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-keyworddisambiguation.config Fri Aug 31 14:18:57 2012
@@ -1,3 +1,3 @@
-stanbol.enhancer.chain.name="default"
-stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","ner","dbpediaLinking","entityhubExtraction","disambiguation-mlt"]
-service.ranking=I"-100"
\ No newline at end of file
+stanbol.enhancer.chain.name="dbpedia-keyword-disambiguation"
+stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpediaKeyword","disambiguation-mlt"]
+service.ranking=I"0"
\ No newline at end of file

Modified: incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config (original)
+++ incubator/stanbol/branches/disambiguation-engine/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.keywordextraction.engine.KeywordLinkingEngine-dbpediakeyword.config Fri Aug 31 14:18:57 2012
@@ -16,4 +16,5 @@ org.apache.stanbol.enhancer.engines.keyw
 org.apache.stanbol.enhancer.engines.keywordextraction.redirectField="rdfs:seeAlso"
 stanbol.enhancer.engine.name="dbpediaKeyword"
 org.apache.stanbol.enhancer.engines.keywordextraction.defaultMatchingLanguage="en"
-org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer=B"false"
+org.apache.stanbol.enhancer.engines.keywordextraction.keywordTokenizer=B"false"
+org.apache.stanbol.enhancer.engines.keywordextraction.minFoundTokens=I"1"

Modified: incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/disambiguation-mlt/src/main/java/org/apache/stanbol/enhancer/engine/disambiguation/mlt/DisambiguatorEngine.java Fri Aug 31 14:18:57 2012
@@ -262,9 +262,7 @@ public class DisambiguatorEngine extends
                 disData.allSelectedTexts, 
                 window);
                 //savedEntity.getContext()); 
-            disambiguationContext = unionString(false,
-                Collections.singleton(savedEntity.getName()),
-                contextSelections);
+          disambiguationContext = unionString(false, contextSelections);
             
             //(2) I do not understand this variant (see comment for the 
             //    EntitiesInRange(..) method
@@ -278,6 +276,11 @@ public class DisambiguatorEngine extends
 //                Collections.singleton(context), //the context
 //                contextSelections); //other selected parsed in the context
             
+            //or just the name of the entity AND the context
+//            disambiguationContext = unionString(false,
+//                Collections.singleton(savedEntity.getName()),
+//                contextSelections);
+            
             //(4) TODO: I would also like to have the possibility to disambiguate
             //    using URIs of Entities suggested for other TextAnnotations
             //    within the context.

Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Fri Aug 31 14:18:57 2012
@@ -205,7 +205,12 @@ public class KeywordLinkingEngine 
      * The literal representing the LangIDEngine as creator.
      */
     public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
-    
+
+    /**
+     * The default value for the LIMIT of the {@link EntitySearcher}
+     */
+    private static final int DEFAULT_ENTITY_SEARCHER_LIMIT = 10;
+
     private EntitySearcher entitySearcher;
     private EntityLinkerConfig linkerConfig;
     private TextAnalyzerConfig nlpConfig;
@@ -873,9 +878,9 @@ public class KeywordLinkingEngine 
         }
         //TODO: make limit configurable!
         if(Entityhub.ENTITYHUB_IDS.contains(referencedSiteName.toLowerCase())){
-            entitySearcher = new EntityhubSearcher(context.getBundleContext(),10);
+            entitySearcher = new EntityhubSearcher(context.getBundleContext(),DEFAULT_ENTITY_SEARCHER_LIMIT);
         } else {
-            entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,10);
+            entitySearcher = new ReferencedSiteSearcher(context.getBundleContext(),referencedSiteName,DEFAULT_ENTITY_SEARCHER_LIMIT);
         }
     }
     /**

Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntityLinker.java Fri Aug 31 14:18:57 2012
@@ -56,6 +56,8 @@ public class EntityLinker {
      * The map holding the results of the linking process
      */
     private final Map<String,LinkedEntity> linkedEntities = new HashMap<String,LinkedEntity>();
+
+    private Integer lookupLimit;
     
     /**
      * After {@link #process()}ing this returns the entities linked for the
@@ -79,6 +81,7 @@ public class EntityLinker {
         this.entitySearcher = taxonomy;
         this.config = config;
         this.state = new ProcessingState(content.getAnalysedText());
+        this.lookupLimit  = Math.max(10,config.getMaxSuggestions()*2);
     }
     /**
      * Steps over the sentences, chunks, tokens of the {@link #sentences}
@@ -289,8 +292,11 @@ public class EntityLinker {
     private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException {
         Collection<? extends Representation> results;
         try {
-            results = entitySearcher.lookup(config.getNameField(),config.getSelectedFields(),
-            searchStrings, state.getSentence().getLanguage(),config.getDefaultLanguage());
+            results = entitySearcher.lookup(config.getNameField(),
+                config.getSelectedFields(),
+                searchStrings, 
+                new String[]{state.getSentence().getLanguage(),config.getDefaultLanguage()},
+                lookupLimit);
         } catch (RuntimeException e) {
             throw new EngineException(e.getMessage(),e);
         }
@@ -555,6 +561,7 @@ public class EntityLinker {
                 //processable tokens are counted, but Exact also checks
                 //of non-processable!
                 foundTokens = coveredTokens;
+                foundProcessableTokens = coveredProcessableTokens;
             } else if((foundProcessableTokens >= config.getMinFoundTokens() ||
                     //NOTE (rwesten, 2012-05-21): Do not check if all covered
                     //  Tokens are found, but if all Tokens of the Label are

Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/EntitySearcher.java Fri Aug 31 14:18:57 2012
@@ -44,10 +44,11 @@ public interface EntitySearcher {
      * to be included. Other fields MAY also be included.
      * @param search the tokens to search for. MUST NOT be <code>null</code>
      * @param languages the languages to include in the search 
+     * @param limit The maximum number of results or <code>null</code> to use the default
      * @return the Representations found for the specified query
      * @throws T An exception while searching for concepts
      */
-    Collection<? extends Representation> lookup(String field,Set<String> includeFields,List<String> search,String...languages) throws IllegalStateException;
+    Collection<? extends Representation> lookup(String field, Set<String> includeFields, List<String> search, String[] languages,Integer limit) throws IllegalStateException;
     /**
      * Lookup a concept of the taxonomy by the id.
      * @param id the id

Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/EntityhubSearcher.java Fri Aug 31 14:18:57 2012
@@ -62,13 +62,19 @@ public final class EntityhubSearcher ext
     public Collection<? extends Representation> lookup(String field,
                                            Set<String> includeFields,
                                            List<String> search,
-                                           String... languages) throws IllegalStateException {
+                                           String[] languages,
+                                           Integer limit) throws IllegalStateException {
         Entityhub entityhub = getSearchService();
         if(entityhub == null){
             throw new IllegalStateException("The Entityhub is currently not active");
         }
         FieldQuery query = EntitySearcherUtils.createFieldQuery(entityhub.getQueryFactory(),
             field, includeFields, search, languages);
+        if(limit != null && limit > 0){
+            query.setLimit(limit);
+        } else if(this.limit != null){
+            query.setLimit(this.limit);
+        }
         QueryResultList<Representation> results;
         try {
             results = entityhub.find(query);

Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/linking/impl/ReferencedSiteSearcher.java Fri Aug 31 14:18:57 2012
@@ -69,7 +69,8 @@ public final class ReferencedSiteSearche
     public Collection<? extends Representation> lookup(String field,
                                            Set<String> includeFields,
                                            List<String> search,
-                                           String... languages) throws IllegalStateException {
+                                           String[] languages,
+                                           Integer limit) throws IllegalStateException {
         //build the query and than return the result
         Site site = getSearchService();
         if(site == null){
@@ -77,8 +78,10 @@ public final class ReferencedSiteSearche
         }
         FieldQuery query = EntitySearcherUtils.createFieldQuery(site.getQueryFactory(), 
             field, includeFields, search, languages);
-        if(limit != null){
+        if(limit != null && limit > 0){
             query.setLimit(limit);
+        } else if(this.limit != null){
+            query.setLimit(this.limit);
         }
         QueryResultList<Representation> results;
         try {

Modified: incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java?rev=1379463&r1=1379462&r2=1379463&view=diff
==============================================================================
--- incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java (original)
+++ incubator/stanbol/branches/disambiguation-engine/engines/keywordextraction/src/test/java/org/apache/stanbol/enhancer/engines/keywordextraction/impl/TestSearcherImpl.java Fri Aug 31 14:18:57 2012
@@ -74,7 +74,8 @@ public class TestSearcherImpl implements
     public Collection<? extends Representation> lookup(String field,
                                            Set<String> includeFields,
                                            List<String> search,
-                                           String... languages) throws IllegalStateException {
+                                           String[] languages,
+                                           Integer limit) throws IllegalStateException {
         if(field.equals(nameField)){
             //we do not need sorting
             //Representation needs to implement equals, therefore results filters multiple matches