You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/06/25 08:47:13 UTC

svn commit: r1496359 [1/2] - in /stanbol/trunk/enhancement-engines: entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/ entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ entityhubli...

Author: rwesten
Date: Tue Jun 25 06:47:12 2013
New Revision: 1496359

URL: http://svn.apache.org/r1496359
Log:
STANBOL-1114: Implementation of all the sub-tasks of this issue. See documentation of those issues for details.

Added:
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Statistic.java
Modified:
    stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java
    stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java
    stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java
    stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
    stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java
    stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java
    stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
    stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TestSearcherImpl.java

Modified: stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java (original)
+++ stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java Tue Jun 25 06:47:12 2013
@@ -4,6 +4,7 @@ import java.util.Iterator;
 
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.PlainLiteral;
+import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.commons.collections.IteratorUtils;
 import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants;
@@ -49,7 +50,7 @@ public class EntityMention extends Entit
      * {@link CoMentionConstants#CO_MENTION_LABEL_FIELD} is parsed as parameter
      * @param span the start/end char indexes of the mention
      */
-    public EntityMention(UriRef uri, MGraph data, UriRef labelField, UriRef typeField, Integer[] span) {
+    public EntityMention(UriRef uri, TripleCollection data, UriRef labelField, UriRef typeField, Integer[] span) {
         super(uri, data);
         if(labelField == null){
             throw new IllegalArgumentException("The LabelField MUST NOT be NULL!");

Modified: stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java (original)
+++ stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java Tue Jun 25 06:47:12 2013
@@ -100,15 +100,15 @@ public class InMemoryEntityIndex impleme
     }
     
     @Override
-    public Entity get(UriRef id, Set<UriRef> includeFields) throws IllegalStateException {
+    public Entity get(UriRef id, Set<UriRef> includeFields, String...languages) throws IllegalStateException {
         return entities.get(id);
     }
 
     @Override
     public Collection<? extends Entity> lookup(UriRef field,
                                            Set<UriRef> includeFields,
-                                           List<String> search,
-                                           String[] languages,Integer numResults) throws IllegalStateException {
+                                           List<String> search, String[] languages,
+                                           Integer numResults, Integer offset) throws IllegalStateException {
         //this assumes that 
         assert nameField.equals(field); //the nameField is the field
         assert Arrays.asList(languages).contains(language); //the parsed languages include the language
@@ -134,20 +134,30 @@ public class InMemoryEntityIndex impleme
         }
         @SuppressWarnings("unchecked") //TODO how to create generic arrays
         Entry<Entity,int[]>[] resultArray = results.entrySet().toArray(new Entry[results.size()]);
-        Arrays.sort(resultArray, RESULT_SCORE_COMPARATOR);
+        int index;
+        if(offset != null && offset.intValue() > 0){
+            index = offset.intValue();
+        } else {
+            index = 0;
+        }
+        if(index >= resultArray.length){ //no more results
+            return Collections.emptyList();
+        }
         //final ranking
-        List<Entity> resultList = new ArrayList<Entity>(Math.min(numResults+3, results.size()));
+        Arrays.sort(resultArray, RESULT_SCORE_COMPARATOR);
+        List<Entity> resultList = new ArrayList<Entity>(Math.min(numResults+3, (resultArray.length-index)));
         int lastScore = -1;
         boolean done = false;
-        for(int i = 0; i < resultArray.length && !done;i++){
-            if(i < numResults){
-                resultList.add(resultArray[i].getKey());
-                if(i == (numResults - 1)){ //memorize the score of the last included
-                    lastScore = resultArray[i].getValue()[0];
+        //start at the parsed offset
+        for(; index < resultArray.length && !done; index++){
+            if(index < numResults){
+                resultList.add(resultArray[index].getKey());
+                if(index == (numResults - 1)){ //memorize the score of the last included
+                    lastScore = resultArray[index].getValue()[0];
                 }
-            } else if (lastScore == resultArray[i].getValue()[0]){
+            } else if (lastScore == resultArray[index].getValue()[0]){
                 //include additional results with the same score
-                resultList.add(resultArray[i].getKey());
+                resultList.add(resultArray[index].getKey());
             } else { //cut of
                 done = true;
             }

Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java Tue Jun 25 06:47:12 2013
@@ -16,12 +16,18 @@
 */
 package org.apache.stanbol.enhancer.engines.entityhublinking;
 
+import java.util.Iterator;
+import java.util.Set;
+
 import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.entityhub.model.clerezza.RdfRepresentation;
 import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
 import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
 import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
 
 public class EntityhubEntity extends Entity {
@@ -29,12 +35,43 @@ public class EntityhubEntity extends Ent
     private static RdfValueFactory vf = RdfValueFactory.getInstance();
     private static UriRef entityRanking = new UriRef(RdfResourceEnum.entityRank.getUri());
     
-    public EntityhubEntity(Representation rep) {
+    public EntityhubEntity(Representation rep, Set<UriRef> fields, Set<String> languages) {
         super(new UriRef(rep.getId()), 
-            (MGraph)vf.toRdfRepresentation(rep).getRdfGraph());
+            toGraph(rep, fields, languages));
     }
     @Override
     public Float getEntityRanking() {
         return EnhancementEngineHelper.get(data, uri, entityRanking, Float.class, lf);
     }
+    /**
+     * Converts {@link Representation}s to RDF ({@link TripleCollection}) and
+     * also filters out literals with languages other than the parsed ones
+     * @param rep
+     * @param languages
+     * @return
+     */
+    private static TripleCollection toGraph(Representation rep, Set<UriRef> includeFields, Set<String> languages){
+        if (rep instanceof RdfRepresentation) {
+            return ((RdfRepresentation) rep).getRdfGraph();
+        } else {
+            //create the Clerezza Represenation
+            RdfRepresentation clerezzaRep = vf.createRepresentation(rep.getId());
+            //Copy all values field by field
+            for (Iterator<String> fields = rep.getFieldNames(); fields.hasNext();) {
+                String field = fields.next();
+                if(includeFields == null || includeFields.contains(field)){
+                    for (Iterator<Object> fieldValues = rep.get(field); fieldValues.hasNext();) {
+                        Object value = fieldValues.next();
+                        if(languages == null || //we need not to filter languages
+                                !(value instanceof Text) || //filter only Text values
+                                languages.contains(((Text)value).getLanguage())){
+                            clerezzaRep.add(field, value);
+                        }
+                    }
+                }
+            }
+            return clerezzaRep.getRdfGraph();
+        }
+        
+    }
 }
\ No newline at end of file

Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java Tue Jun 25 06:47:12 2013
@@ -26,6 +26,7 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES_FIELDS;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.MIN_SEARCH_TOKEN_LENGTH;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.ENTITY_TYPES;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.MIN_TOKEN_SCORE;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.NAME_FIELD;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.REDIRECT_FIELD;
@@ -91,6 +92,7 @@ import org.slf4j.LoggerFactory;
     @Property(name=NAME_FIELD,value="rdfs:label"),
     @Property(name=CASE_SENSITIVE,boolValue=DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
     @Property(name=TYPE_FIELD,value="rdf:type"),
+    @Property(name=ENTITY_TYPES,cardinality=Integer.MAX_VALUE),
     @Property(name=REDIRECT_FIELD,value="rdfs:seeAlso"),
     @Property(name=REDIRECT_MODE,options={
         @PropertyOption(

Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java Tue Jun 25 06:47:12 2013
@@ -20,6 +20,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -57,7 +58,7 @@ public final class EntityhubSearcher ext
     }
     
     @Override
-    public Entity get(UriRef id,Set<UriRef> includeFields) throws EntitySearcherException {
+    public Entity get(UriRef id,Set<UriRef> fields, String...languages) throws EntitySearcherException {
         if(id == null || id.getUnicodeString().isEmpty()){
             return null;
         }
@@ -72,14 +73,27 @@ public final class EntityhubSearcher ext
             throw new EntitySearcherException("Exception while getting "+id+
                 " from the Entityhub",e);
         }
-        return entity == null ? null : new EntityhubEntity(entity.getRepresentation());
+        if(entity != null){
+            Set<String> languageSet;
+            if(languages == null || languages.length < 1){
+                languageSet = null;
+            } else if (languages.length == 1){
+                languageSet = Collections.singleton(languages[0]);
+            } else {
+                languageSet = new HashSet<String>(Arrays.asList(languages));
+            }
+            return new EntityhubEntity(entity.getRepresentation(), fields, languageSet);
+        } else {
+            return null;
+        }
     }
+
     @Override
     public Collection<? extends Entity> lookup(UriRef field,
                                            Set<UriRef> includeFields,
                                            List<String> search,
                                            String[] languages,
-                                           Integer limit) throws EntitySearcherException {
+                                           Integer limit, Integer offset) throws EntitySearcherException {
         Entityhub entityhub = getSearchService();
         if(entityhub == null){
             throw new EntitySearcherException("The Entityhub is currently not active");
@@ -91,6 +105,9 @@ public final class EntityhubSearcher ext
         } else if(this.limit != null){
             query.setLimit(this.limit);
         }
+        if(offset != null && offset.intValue() > 0){
+            query.setOffset(offset.intValue());
+        }
         QueryResultList<Representation> results;
         try {
             results = entityhub.find(query);
@@ -98,11 +115,16 @@ public final class EntityhubSearcher ext
             throw new EntitySearcherException("Exception while searchign for "+
                 search+'@'+Arrays.toString(languages)+"in the Entityhub", e);
         }
-        Collection<Entity> entities = new ArrayList<Entity>(results.size());
-        for(Representation result : results){
-            entities.add(new EntityhubEntity(result));
+        if(!results.isEmpty()){
+            Set<String> languagesSet = new HashSet<String>(Arrays.asList(languages));
+            Collection<Entity> entities = new ArrayList<Entity>(results.size());
+            for(Representation result : results){
+                entities.add(new EntityhubEntity(result, null, languagesSet));
+            }
+            return entities;
+        } else {
+            return Collections.emptyList();
         }
-        return entities;
     }
 
     @Override

Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java Tue Jun 25 06:47:12 2013
@@ -20,6 +20,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -29,6 +30,7 @@ import org.apache.clerezza.rdf.core.UriR
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
 import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Statistic;
 import org.apache.stanbol.entityhub.servicesapi.model.Representation;
 import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
 import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery;
@@ -38,13 +40,18 @@ import org.apache.stanbol.entityhub.serv
 import org.apache.stanbol.entityhub.servicesapi.site.SiteException;
 import org.osgi.framework.BundleContext;
 import org.osgi.util.tracker.ServiceTrackerCustomizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public final class ReferencedSiteSearcher extends TrackingEntitySearcher<Site> implements EntitySearcher {
     
+    private final Logger log = LoggerFactory.getLogger(ReferencedSiteSearcher.class);
     
     private final String siteId;
     private final Integer limit;
     private Map<UriRef,Collection<Resource>> originInfo;
+    Statistic queryStats = new Statistic("query", 100, log);
+    Statistic resultStats = new Statistic("result", 1000, log);
     public ReferencedSiteSearcher(BundleContext context,String siteId, Integer limit){
         this(context,siteId,limit,null);
     }
@@ -61,7 +68,7 @@ public final class ReferencedSiteSearche
     }
     
     @Override
-    public Entity get(UriRef id,Set<UriRef> includeFields) {
+    public Entity get(UriRef id,Set<UriRef> fields, String ... languages) {
         if(id == null || id.getUnicodeString().isEmpty()){
             return null;
         }
@@ -76,7 +83,19 @@ public final class ReferencedSiteSearche
             throw new IllegalStateException("Exception while getting "+id+
                 " from the ReferencedSite "+site.getId(),e);
         }
-        return entity == null ? null : new EntityhubEntity(entity.getRepresentation());
+        if(entity != null){
+            Set<String> languageSet;
+            if(languages == null || languages.length < 1){
+                languageSet = null;
+            } else if (languages.length == 1){
+                languageSet = Collections.singleton(languages[0]);
+            } else {
+                languageSet = new HashSet<String>(Arrays.asList(languages));
+            }
+            return new EntityhubEntity(entity.getRepresentation(), fields, languageSet);
+        } else {
+            return null;
+        }
     }
 
     @Override
@@ -84,12 +103,13 @@ public final class ReferencedSiteSearche
                                            Set<UriRef> includeFields,
                                            List<String> search,
                                            String[] languages,
-                                           Integer limit) throws IllegalStateException {
+                                           Integer limit, Integer offset) throws IllegalStateException {
         //build the query and than return the result
         Site site = getSearchService();
         if(site == null){
             throw new IllegalStateException("ReferencedSite "+siteId+" is currently not available");
         }
+        queryStats.begin();
         FieldQuery query = EntitySearcherUtils.createFieldQuery(site.getQueryFactory(), 
             field, includeFields, search, languages);
         if(limit != null && limit > 0){
@@ -97,6 +117,9 @@ public final class ReferencedSiteSearche
         } else if(this.limit != null){
             query.setLimit(this.limit);
         }
+        if(offset != null && offset.intValue() > 0){
+            query.setOffset(offset.intValue());
+        }
         QueryResultList<Representation> results;
         try {
             results = site.find(query);
@@ -105,12 +128,20 @@ public final class ReferencedSiteSearche
                 search+'@'+Arrays.toString(languages)+"in the ReferencedSite "+
                 site.getId(), e);
         }
-        Collection<Entity> entities = new ArrayList<Entity>(results.size());
-        for(Representation result : results){
-            entities.add(new EntityhubEntity(result));
-        }
-        return entities;
+        queryStats.complete();
+        if(!results.isEmpty()){
+            Set<String> languagesSet = new HashSet<String>(Arrays.asList(languages));
+            Collection<Entity> entities = new ArrayList<Entity>(results.size());
+            for(Representation result : results){
+                resultStats.begin();
+                entities.add(new EntityhubEntity(result, null, languagesSet));
+                resultStats.complete();
+            }
+            return entities;
+        } else {
+            return Collections.emptyList();
         }
+    }
 
     @Override
     public boolean supportsOfflineMode() {

Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties Tue Jun 25 06:47:12 2013
@@ -126,3 +126,9 @@ of the text must match a Token of the La
 The score is calculated by comparing the matching characters from the beginning of the two \
 tokens compared to the overall size of the token. So it allows derivations at the end of the \
 tokens (e.g. because of inflected forms of words).
+
+enhancer.engines.linking.entityTypes.name=Entity Type Filter
+enhancer.engines.linking.entityTypes.description=Allows to define a white/black list \
+based on the types of Entities. Use '!{uri}' for black listing and '{uri}' for white \
+listing. Include '*' to force white listing (e.g. to allow Entities without any type). \
+Rules are processed based on their order.

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java Tue Jun 25 06:47:12 2013
@@ -22,6 +22,7 @@ import org.apache.clerezza.rdf.core.Lite
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.PlainLiteral;
 import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.TypedLiteral;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.commons.collections.Predicate;
@@ -57,14 +58,14 @@ public class Entity implements Comparabl
      * of the entity by containing {@link Triple}s that use the {@link #uri} as
      * {@link Triple#getSubject() subject}
      */
-    protected final MGraph data;
+    protected final TripleCollection data;
     
     /**
      * Constructs a new Entity
      * @param uri
      * @param data
      */
-    public Entity(UriRef uri, MGraph data) {
+    public Entity(UriRef uri, TripleCollection data) {
         this.uri = uri;
         this.data = data;
     }
@@ -74,7 +75,7 @@ public class Entity implements Comparabl
     public final String getId(){
         return uri.getUnicodeString();
     }
-    public final MGraph getData() {
+    public final TripleCollection getData() {
         return data;
     }
     @SuppressWarnings("unchecked")

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java Tue Jun 25 06:47:12 2013
@@ -21,6 +21,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.clerezza.rdf.core.PlainLiteral;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.UriRef;
 
@@ -44,6 +45,7 @@ public interface EntitySearcher {
      * @param search the tokens to search for. MUST NOT be <code>null</code>
      * @param languages the languages to include in the search 
      * @param limit The maximum number of results or <code>null</code> to use the default
+     * @param offset The offset of the first requested search result
      * @return the Entities found for the specified query containing information for
      * all selected fields
      * @throws EntitySearcherException An exception while searching for concepts
@@ -51,19 +53,21 @@ public interface EntitySearcher {
      * the list with the search terms is <code>null</code> or empty;
      */
     Collection<? extends Entity> lookup(UriRef field, Set<UriRef> selectedFields, 
-        List<String> search, String[] languages, Integer limit) 
+        List<String> search, String[] languages, Integer limit, Integer offset) 
                 throws EntitySearcherException;
     /**
      * Lookup an Entity of the linked vocabulary by the id.
      * @param id the id
      * @param selectedFields A set of fields that need to be included within the 
      * returned {@link Representation}. Other fields MAY be also included.
+     * @param languages the list of languages for {@link PlainLiteral}s that
+     * should be included in the returned Entity
      * @return the concept or <code>null</code> if not found
      * @throws EntitySearcherException on any error while dereferencing the
      * Entity with the parsed Id
      * @throws IllegalArgumentException if the parsed id is <code>null</code>
      */
-    Entity get(UriRef id,Set<UriRef> selectedFields) throws EntitySearcherException;
+    Entity get(UriRef id,Set<UriRef> selectedFields, String...languages) throws EntitySearcherException;
     /**
      * Returns <code>true</code> if this EntitySearcher can operate without
      * dependencies to remote services. This is important because Stanbol can

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java Tue Jun 25 06:47:12 2013
@@ -18,12 +18,14 @@ package org.apache.stanbol.enhancer.engi
 
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -69,6 +71,10 @@ public class EntityLinkerConfig {
      */
     public static final String TYPE_FIELD = "enhancer.engines.linking.typeField";
     /**
+     * Allows to configure a list of entity types that are white/black listed.
+     */
+    public static final String ENTITY_TYPES = "enhancer.engines.linking.entityTypes";
+    /**
      * Allows to enable/disable case sensitive matching
      */
     public static final String CASE_SENSITIVE = "enhancer.engines.linking.caseSensitive";
@@ -374,6 +380,9 @@ public class EntityLinkerConfig {
     private UriRef nameField;
     private UriRef redirectField;
     private UriRef typeField;
+    private Map<UriRef,Integer> blacklistedTypes = new HashMap<UriRef,Integer>();
+    private Map<UriRef,Integer> whitelistedTypes = new HashMap<UriRef,Integer>();
+    private Boolean defaultWhitelistTypes = null;
     private Set<UriRef> dereferencedFields = new HashSet<UriRef>();
 
     private Set<UriRef> __selectedFields;
@@ -832,9 +841,72 @@ public class EntityLinkerConfig {
             linkerConfig.setRankEqualScoresBasedOnEntityRankings(
                 DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS);
         }
-        
+        //init the list of whitelisted/blacklisted types
+        value = configuration.get(ENTITY_TYPES);
+        List<String> entityTypesConfig; //first collect and cleanup the config
+        if(value == null){
+            entityTypesConfig = Collections.emptyList();
+        } else if(value instanceof String[]){
+            entityTypesConfig = new ArrayList<String>();
+            for(String type : (String[])value){
+                if(type != null){
+                    type = type.trim();
+                    if(!type.isEmpty()){
+                        entityTypesConfig.add(type);
+                    }
+                }
+            }
+        } else if(value instanceof Collection<?>){
+            entityTypesConfig = new ArrayList<String>();
+            for(Object o : (Collection<Object>)value){
+                if(o != null){
+                    String type = o.toString().trim();
+                    if(!type.isEmpty()){
+                        entityTypesConfig.add(type);
+                    }
+                }
+            }
+        } else if(value instanceof String){ //support parsing single values as string
+            String type = value.toString().trim();
+            if(type.isEmpty()){
+                entityTypesConfig = Collections.emptyList();
+            } else {
+                entityTypesConfig = Collections.singletonList(type);
+            }
+        } else {
+            throw new ConfigurationException(ENTITY_TYPES, "The list of ignored types (if present) "
+                + "MUST BE a collection or a string array (present: "+value.getClass().getName()+")!");
+        }
+        //apply the config
+        for(int i = 0; i < entityTypesConfig.size(); i++){
+            String type = entityTypesConfig.get(i);
+            if("*".equals(type)){
+                linkerConfig.setDefaultWhitelistTypes(Boolean.TRUE);
+            } else {
+                boolean blacklisted = type.charAt(0) == '!';
+                if(blacklisted && type.length() < 2){
+                    throw new ConfigurationException(ENTITY_TYPES, "The list of whitelisted/blacklisted "
+                        + "MUST NOT contain '!' (configured: "+entityTypesConfig+")!");
+                }
+                UriRef uri = new UriRef(getFullName(prefixService, ENTITY_TYPES, 
+                    blacklisted ? type.substring(1) : type));
+                if(blacklisted){
+                    linkerConfig.addBlacklistType(uri, Integer.valueOf(i));
+                } else {
+                    linkerConfig.addWhitelistType(uri, Integer.valueOf(i));
+                }
+            }
+        }
     }
-    
+    /**
+     * Gets the full URI for the parsed value by using the parsed {@link NamespacePrefixService}
+     * @param prefixService the {@link NamespacePrefixService} used to lookup the full URI
+     * @param property the config property (just used to create a {@link ConfigurationException}
+     * in case the used namespace prefix is unknown by the namespace prefix service)
+     * @param value the configured value (might be both a short or a full URI)
+     * @return the full URI
+     * @throws ConfigurationException if the namespace prefix used by the value is unknown
+     */
     private static String getFullName(NamespacePrefixService prefixService, String property,String value) throws ConfigurationException {
         String prefix = NamespaceMappingUtils.getPrefix(value);
         if(prefixService == null){
@@ -1334,4 +1406,61 @@ public class EntityLinkerConfig {
         this.rankEqualScoresBasedOnEntityRankings = state;
     }
     
+    /**
+     * Adds a type to the blacklist
+     */
+    public final void addBlacklistType(UriRef type, Integer order) {
+        if(type != null && order != null){
+            blacklistedTypes.put(type, order);
+        }
+    }
+    /**
+     * Adds a type to the whitelist
+     */
+    public final void addWhitelistType(UriRef type, Integer order) {
+        if(type != null && order != null){
+            whitelistedTypes.put(type, order);
+        }
+    }
+
+    public final void setDefaultWhitelistTypes(Boolean state){
+        this.defaultWhitelistTypes = state;
+    }
+    
+ 
+    public final boolean isDefaultWhitelistTypes(){
+        if(Boolean.FALSE.equals(defaultWhitelistTypes) && whitelistedTypes.isEmpty()){
+            //illegal configuration ... ignore
+            return true;
+        } else {
+            return defaultWhitelistTypes != null ? defaultWhitelistTypes.booleanValue() : 
+                whitelistedTypes.isEmpty(); //if whitelist is empty ... true
+        }
+    }
+    
+    /**
+     * @return the blacklisted entity types mapped to the order value they were added with
+     */
+    public final Map<UriRef, Integer> getBlacklistedTypes() {
+        return blacklistedTypes;
+    }
+    
+    
+    /**
+     * @return the whitelisted entity types mapped to the order value they were added with
+     */
+    public final Map<UriRef, Integer> getWhitelistedTypes() {
+        return whitelistedTypes;
+    }
+    /**
+     * checks if EntityType filtering is active or not
+     */
+    public final boolean isEntityTypeFilteringActive(){
+        if(whitelistedTypes.isEmpty() && blacklistedTypes.isEmpty()){
+            return false;
+        } else {
+            return true;
+        }
+    }
+    
 }
\ No newline at end of file

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java Tue Jun 25 06:47:12 2013
@@ -72,6 +72,26 @@ public class LanguageProcessingConfig im
     public static final double DEFAULT_MIN_POS_ANNOTATION_PROBABILITY = 0.75;
 
     /**
+     * Default {@link LexicalCategory LexicalCategories} that allow the EntityLinker 
+     * to step-over non matchable tokens when determining search tokens for 
+     * Entityhub lookups (Defaults: {@link LexicalCategory#Noun}, 
+     * {@link LexicalCategory#Punctuation} and {@link LexicalCategory#Conjuction}).
+     */
+    public static final Set<LexicalCategory> DEFAULT_CHUNKABLE_CATEGORIES = EnumSet.of(
+        LexicalCategory.Noun, LexicalCategory.Punctuation, LexicalCategory.Conjuction);
+    
+    /**
+     * Default {@link Pos} tags that allow the EntityLinker to step-over non matchable 
+     * tokens when determining search tokens for Entityhub lookups (default: {@link Pos#Preposition}).
+     */
+    private static final Set<Pos> DEFAULT_CHUNKABLE_POS = EnumSet.of(Pos.Preposition);
+    /**
+     * Default string tags that allow the EntityLinker to step-over non matchable 
+     * tokens when determining search tokens for Entityhub lookups (default: empty).
+     */
+    private static final Set<String> DEFAULT_CHUNKABKE_TAGS = Collections.emptySet();
+    
+    /**
      * Default value for POS annotation confidence required for not-processed POS tags
      * (not contained in both {@link #getLinkedLexicalCategories()} and 
      * {@link #getLinkedPosTags()}). <br> The default is 
@@ -139,6 +159,9 @@ public class LanguageProcessingConfig im
 
     private boolean ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
 
+    private Set<LexicalCategory> chunkableCategories = DEFAULT_CHUNKABLE_CATEGORIES;
+    private Set<Pos> chunkablePos = DEFAULT_CHUNKABLE_POS;
+    private Set<String> chunkableTags = DEFAULT_CHUNKABKE_TAGS;
 
     private double minPhraseAnnotationProbability = DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY;
 
@@ -516,6 +539,7 @@ public class LanguageProcessingConfig im
     public void setMinSearchTokenLength(int minSearchTokenLength) {
         this.minSearchTokenLength = minSearchTokenLength;
     }
+    
     /**
      * The minimum number of character a {@link Token} (word) must have to be
      * used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
@@ -546,6 +570,76 @@ public class LanguageProcessingConfig im
     }
     
     /**
+     * Getter for the chunkable {@link LexicalCategory LexicalCategories}. Those
+     * allow the EntityLinker to step-over non matchable tokens when determining 
+     * search tokens for Entityhub lookups.
+     * @return the {@link LexicalCategory LexicalCategories} considered as chunkable
+     */
+    public Set<LexicalCategory> getChunkableCategories(){
+        return chunkableCategories;
+    }
+    
+    /**
+     * Setter for the chunkable {@link LexicalCategory LexicalCategories}. Those
+     * allow the EntityLinker to step-over non matchable tokens when determining 
+     * search tokens for Entityhub lookups.
+     * @param categories The list of {@link LexicalCategory LexicalCategories} 
+     * considered as chunkable or <code>null</code> to reset to the default
+     */
+    public void setChunkableCategories(Set<LexicalCategory> categories){
+        if(categories == null){
+            this.chunkableCategories = DEFAULT_CHUNKABLE_CATEGORIES;
+        } else {
+            this.chunkableCategories = categories;
+        }
+    }
+
+    /**
+     * Setter for the {@link Pos} tags considered by the EntityLinker to step-over 
+     * non matchable tokens when determining search tokens for Entityhub lookups
+     * @param pos The list of {@link Pos} tags considered as chunkable or 
+     * <code>null</code> to reset to the default
+     */
+    public void setChunkablePos(Set<Pos> pos){
+        if(pos == null){
+            this.chunkablePos = DEFAULT_CHUNKABLE_POS;
+        } else {
+            this.chunkablePos = pos;
+        }
+    }
+    
+    /**
+     * Setter for the String tags considered by the EntityLinker to step-over 
+     * non matchable tokens when determining search tokens for Entityhub lookups
+     * @param tags The list of String tags considered as chunkable or 
+     * <code>null</code> to reset to the default
+     */
+    public void setChunkableTags(Set<String> tags){
+        if(tags == null){
+            this.chunkableTags = DEFAULT_CHUNKABKE_TAGS;
+        } else {
+            this.chunkableTags = tags;
+        }
+    }
+    /**
+     * Getter for the {@link Pos} tags considered by the EntityLinker to step-over 
+     * non matchable tokens when determining search tokens for Entityhub lookups
+     * @return the {@link Pos} tags considered as chunkable
+     */
+    public Set<Pos> getChunkablePos(){
+        return chunkablePos;
+    }
+    
+    /**
+     * Getter for the String tags considered by the EntityLinker to step-over 
+     * non matchable tokens when determining search tokens for Entityhub lookups
+     * @return the String tags considered as chunkable
+     */
+    public Set<String> getChunkableTags(){
+        return chunkableTags;
+    }
+    
+    /**
      * Clones the {@link LanguageProcessingConfig}. Intended to be used
      * to create language specific configs based on the default one.
      */
@@ -568,6 +662,9 @@ public class LanguageProcessingConfig im
         c.matchedLexicalCategories = matchedLexicalCategories;
         c.minSearchTokenLength = minSearchTokenLength;
         c.linkOnlyUpperCaseTokenWithUnknownPos = linkOnlyUpperCaseTokenWithUnknownPos;
+        c.chunkableCategories = chunkableCategories;
+        c.chunkablePos = chunkablePos;
+        c.chunkableTags = chunkableTags;
         return c;
     }
 

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java Tue Jun 25 06:47:12 2013
@@ -126,6 +126,11 @@ public class TextProcessingConfig {
     public static final String PARAM_POS_TYPES = "pos";
     public static final String PARAM_POS_TAG = "tag";
     public static final String PARAM_POS_PROBABILITY = "prob";
+    
+    public static final String PARAM_CHUNKABLE_CATEGORIES = "cc";
+    public static final String PARAM_CHUNKABLE_POS_TYPES = "cp";
+    public static final String PARAM_CHUNKABLE_TAGS = "ct";
+    
     /**
      * Parameter used to configure how to deal with upper case tokens
      */
@@ -414,6 +419,24 @@ public class TextProcessingConfig {
         } else {
             log.info("   - use upper case token mode: match={}, link={}", tpc.isMatchUpperCaseTokens(), tpc.isLinkUpperCaseTokens());
         }
+        //apply chunkable parameters (STANBOL-1117)
+        if(config.containsKey(PARAM_CHUNKABLE_CATEGORIES)){
+            Set<LexicalCategory> chunkableCategories = parseEnumParam(config, PROCESSED_LANGUAGES, 
+            language, PARAM_CHUNKABLE_CATEGORIES, LexicalCategory.class);
+            log.info(" ... set chunkable Categories to {}", chunkableCategories);
+            tpc.setChunkableCategories(chunkableCategories);
+        }
+        if(config.containsKey(PARAM_CHUNKABLE_POS_TYPES)){
+            Set<Pos> chunkablePos = parseEnumParam(config, PROCESSED_LANGUAGES, 
+            language, PARAM_CHUNKABLE_POS_TYPES, Pos.class);
+            log.info(" ... set chunkable POS tags to {}", chunkablePos);
+            tpc.setChunkablePos(chunkablePos);
+        }
+        if(config.containsKey(PARAM_CHUNKABLE_TAGS)){
+            Set<String> chunkableTags = parseStringTags(config.get(PARAM_CHUNKABLE_TAGS));
+            log.info(" ... set chunkable String tags to {}", chunkableTags);
+            tpc.setChunkableTags(chunkableTags);
+        }
     }
 
     private static Boolean parseState(Map<String,String> config, String param){

Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java Tue Jun 25 06:47:12 2013
@@ -261,6 +261,9 @@ public class EntityLinkingEngine impleme
             log.error("Unable to link Entities with "+entityLinker,e);
             throw new EngineException(this, ci, "Unable to link Entities with "+entityLinker, e);
         }
+        if(log.isInfoEnabled()){
+            entityLinker.logStatistics(log);
+        }
         //write results (requires a write lock)
         ci.getLock().writeLock().lock();
         try {