You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/06/25 08:47:13 UTC
svn commit: r1496359 [1/2] - in /stanbol/trunk/enhancement-engines:
entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/
entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/
entityhubli...
Author: rwesten
Date: Tue Jun 25 06:47:12 2013
New Revision: 1496359
URL: http://svn.apache.org/r1496359
Log:
STANBOL-1114: Implementation of all the sub-tasks of this issue. See documentation of those issues for details.
Added:
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/Statistic.java
Modified:
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java
stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java
stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java
stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/EntityLinker.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/LabelMatch.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/ProcessingState.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngineTest.java
stanbol/trunk/enhancement-engines/entitylinking/engine/src/test/java/org/apache/stanbol/enhancer/engines/entitylinking/impl/TestSearcherImpl.java
Modified: stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java (original)
+++ stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/EntityMention.java Tue Jun 25 06:47:12 2013
@@ -4,6 +4,7 @@ import java.util.Iterator;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.PlainLiteral;
+import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.commons.collections.IteratorUtils;
import org.apache.stanbol.enhancer.engines.entitycomention.CoMentionConstants;
@@ -49,7 +50,7 @@ public class EntityMention extends Entit
* {@link CoMentionConstants#CO_MENTION_LABEL_FIELD} is parsed as parameter
* @param span the start/end char indexes of the mention
*/
- public EntityMention(UriRef uri, MGraph data, UriRef labelField, UriRef typeField, Integer[] span) {
+ public EntityMention(UriRef uri, TripleCollection data, UriRef labelField, UriRef typeField, Integer[] span) {
super(uri, data);
if(labelField == null){
throw new IllegalArgumentException("The LabelField MUST NOT be NULL!");
Modified: stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java (original)
+++ stanbol/trunk/enhancement-engines/entitycomention/src/main/java/org/apache/stanbol/enhancer/engines/entitycomention/impl/InMemoryEntityIndex.java Tue Jun 25 06:47:12 2013
@@ -100,15 +100,15 @@ public class InMemoryEntityIndex impleme
}
@Override
- public Entity get(UriRef id, Set<UriRef> includeFields) throws IllegalStateException {
+ public Entity get(UriRef id, Set<UriRef> includeFields, String...languages) throws IllegalStateException {
return entities.get(id);
}
@Override
public Collection<? extends Entity> lookup(UriRef field,
Set<UriRef> includeFields,
- List<String> search,
- String[] languages,Integer numResults) throws IllegalStateException {
+ List<String> search, String[] languages,
+ Integer numResults, Integer offset) throws IllegalStateException {
//this assumes that
assert nameField.equals(field); //the nameField is the field
assert Arrays.asList(languages).contains(language); //the parsed languages include the language
@@ -134,20 +134,30 @@ public class InMemoryEntityIndex impleme
}
@SuppressWarnings("unchecked") //TODO how to create generic arrays
Entry<Entity,int[]>[] resultArray = results.entrySet().toArray(new Entry[results.size()]);
- Arrays.sort(resultArray, RESULT_SCORE_COMPARATOR);
+ int index;
+ if(offset != null && offset.intValue() > 0){
+ index = offset.intValue();
+ } else {
+ index = 0;
+ }
+ if(index >= resultArray.length){ //no more results
+ return Collections.emptyList();
+ }
//final ranking
- List<Entity> resultList = new ArrayList<Entity>(Math.min(numResults+3, results.size()));
+ Arrays.sort(resultArray, RESULT_SCORE_COMPARATOR);
+ List<Entity> resultList = new ArrayList<Entity>(Math.min(numResults+3, (resultArray.length-index)));
int lastScore = -1;
boolean done = false;
- for(int i = 0; i < resultArray.length && !done;i++){
- if(i < numResults){
- resultList.add(resultArray[i].getKey());
- if(i == (numResults - 1)){ //memorize the score of the last included
- lastScore = resultArray[i].getValue()[0];
+ //start at the parsed offset
+ for(; index < resultArray.length && !done; index++){
+ if(index < numResults){
+ resultList.add(resultArray[index].getKey());
+ if(index == (numResults - 1)){ //memorize the score of the last included
+ lastScore = resultArray[index].getValue()[0];
}
- } else if (lastScore == resultArray[i].getValue()[0]){
+ } else if (lastScore == resultArray[index].getValue()[0]){
//include additional results with the same score
- resultList.add(resultArray[i].getKey());
+ resultList.add(resultArray[index].getKey());
} else { //cut of
done = true;
}
Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubEntity.java Tue Jun 25 06:47:12 2013
@@ -16,12 +16,18 @@
*/
package org.apache.stanbol.enhancer.engines.entityhublinking;
+import java.util.Iterator;
+import java.util.Set;
+
import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.entityhub.model.clerezza.RdfRepresentation;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
public class EntityhubEntity extends Entity {
@@ -29,12 +35,43 @@ public class EntityhubEntity extends Ent
private static RdfValueFactory vf = RdfValueFactory.getInstance();
private static UriRef entityRanking = new UriRef(RdfResourceEnum.entityRank.getUri());
- public EntityhubEntity(Representation rep) {
+ public EntityhubEntity(Representation rep, Set<UriRef> fields, Set<String> languages) {
super(new UriRef(rep.getId()),
- (MGraph)vf.toRdfRepresentation(rep).getRdfGraph());
+ toGraph(rep, fields, languages));
}
@Override
public Float getEntityRanking() {
return EnhancementEngineHelper.get(data, uri, entityRanking, Float.class, lf);
}
+ /**
+ * Converts {@link Representation}s to RDF ({@link TripleCollection}) and
+ * also filters out {@link Text} values with languages other than the parsed ones
+ * @param rep
+ * @param languages
+ * @return
+ */
+ private static TripleCollection toGraph(Representation rep, Set<UriRef> includeFields, Set<String> languages){
+ if (rep instanceof RdfRepresentation) {
+ return ((RdfRepresentation) rep).getRdfGraph();
+ } else {
+ //create the Clerezza Representation
+ RdfRepresentation clerezzaRep = vf.createRepresentation(rep.getId());
+ //Copy all values field by field
+ for (Iterator<String> fields = rep.getFieldNames(); fields.hasNext();) {
+ String field = fields.next();
+ if(includeFields == null || includeFields.contains(field)){
+ for (Iterator<Object> fieldValues = rep.get(field); fieldValues.hasNext();) {
+ Object value = fieldValues.next();
+ if(languages == null || //we need not to filter languages
+ !(value instanceof Text) || //filter only Text values
+ languages.contains(((Text)value).getLanguage())){
+ clerezzaRep.add(field, value);
+ }
+ }
+ }
+ }
+ return clerezzaRep.getRdfGraph();
+ }
+
+ }
}
\ No newline at end of file
Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubLinkingEngine.java Tue Jun 25 06:47:12 2013
@@ -26,6 +26,7 @@ import static org.apache.stanbol.enhance
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.DEREFERENCE_ENTITIES_FIELDS;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.MIN_SEARCH_TOKEN_LENGTH;
+import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.ENTITY_TYPES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.MIN_TOKEN_SCORE;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.NAME_FIELD;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.EntityLinkerConfig.REDIRECT_FIELD;
@@ -91,6 +92,7 @@ import org.slf4j.LoggerFactory;
@Property(name=NAME_FIELD,value="rdfs:label"),
@Property(name=CASE_SENSITIVE,boolValue=DEFAULT_CASE_SENSITIVE_MATCHING_STATE),
@Property(name=TYPE_FIELD,value="rdf:type"),
+ @Property(name=ENTITY_TYPES,cardinality=Integer.MAX_VALUE),
@Property(name=REDIRECT_FIELD,value="rdfs:seeAlso"),
@Property(name=REDIRECT_MODE,options={
@PropertyOption(
Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/EntityhubSearcher.java Tue Jun 25 06:47:12 2013
@@ -20,6 +20,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -57,7 +58,7 @@ public final class EntityhubSearcher ext
}
@Override
- public Entity get(UriRef id,Set<UriRef> includeFields) throws EntitySearcherException {
+ public Entity get(UriRef id,Set<UriRef> fields, String...languages) throws EntitySearcherException {
if(id == null || id.getUnicodeString().isEmpty()){
return null;
}
@@ -72,14 +73,27 @@ public final class EntityhubSearcher ext
throw new EntitySearcherException("Exception while getting "+id+
" from the Entityhub",e);
}
- return entity == null ? null : new EntityhubEntity(entity.getRepresentation());
+ if(entity != null){
+ Set<String> languageSet;
+ if(languages == null || languages.length < 1){
+ languageSet = null;
+ } else if (languages.length == 1){
+ languageSet = Collections.singleton(languages[0]);
+ } else {
+ languageSet = new HashSet<String>(Arrays.asList(languages));
+ }
+ return new EntityhubEntity(entity.getRepresentation(), fields, languageSet);
+ } else {
+ return null;
+ }
}
+
@Override
public Collection<? extends Entity> lookup(UriRef field,
Set<UriRef> includeFields,
List<String> search,
String[] languages,
- Integer limit) throws EntitySearcherException {
+ Integer limit, Integer offset) throws EntitySearcherException {
Entityhub entityhub = getSearchService();
if(entityhub == null){
throw new EntitySearcherException("The Entityhub is currently not active");
@@ -91,6 +105,9 @@ public final class EntityhubSearcher ext
} else if(this.limit != null){
query.setLimit(this.limit);
}
+ if(offset != null && offset.intValue() > 0){
+ query.setOffset(offset.intValue());
+ }
QueryResultList<Representation> results;
try {
results = entityhub.find(query);
@@ -98,11 +115,16 @@ public final class EntityhubSearcher ext
throw new EntitySearcherException("Exception while searchign for "+
search+'@'+Arrays.toString(languages)+"in the Entityhub", e);
}
- Collection<Entity> entities = new ArrayList<Entity>(results.size());
- for(Representation result : results){
- entities.add(new EntityhubEntity(result));
+ if(!results.isEmpty()){
+ Set<String> languagesSet = new HashSet<String>(Arrays.asList(languages));
+ Collection<Entity> entities = new ArrayList<Entity>(results.size());
+ for(Representation result : results){
+ entities.add(new EntityhubEntity(result, null, languagesSet));
+ }
+ return entities;
+ } else {
+ return Collections.emptyList();
}
- return entities;
}
@Override
Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/java/org/apache/stanbol/enhancer/engines/entityhublinking/ReferencedSiteSearcher.java Tue Jun 25 06:47:12 2013
@@ -20,6 +20,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -29,6 +30,7 @@ import org.apache.clerezza.rdf.core.UriR
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.stanbol.enhancer.engines.entitylinking.Entity;
import org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcher;
+import org.apache.stanbol.enhancer.engines.entitylinking.impl.Statistic;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery;
@@ -38,13 +40,18 @@ import org.apache.stanbol.entityhub.serv
import org.apache.stanbol.entityhub.servicesapi.site.SiteException;
import org.osgi.framework.BundleContext;
import org.osgi.util.tracker.ServiceTrackerCustomizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public final class ReferencedSiteSearcher extends TrackingEntitySearcher<Site> implements EntitySearcher {
+ private final Logger log = LoggerFactory.getLogger(ReferencedSiteSearcher.class);
private final String siteId;
private final Integer limit;
private Map<UriRef,Collection<Resource>> originInfo;
+ Statistic queryStats = new Statistic("query", 100, log);
+ Statistic resultStats = new Statistic("result", 1000, log);
public ReferencedSiteSearcher(BundleContext context,String siteId, Integer limit){
this(context,siteId,limit,null);
}
@@ -61,7 +68,7 @@ public final class ReferencedSiteSearche
}
@Override
- public Entity get(UriRef id,Set<UriRef> includeFields) {
+ public Entity get(UriRef id,Set<UriRef> fields, String ... languages) {
if(id == null || id.getUnicodeString().isEmpty()){
return null;
}
@@ -76,7 +83,19 @@ public final class ReferencedSiteSearche
throw new IllegalStateException("Exception while getting "+id+
" from the ReferencedSite "+site.getId(),e);
}
- return entity == null ? null : new EntityhubEntity(entity.getRepresentation());
+ if(entity != null){
+ Set<String> languageSet;
+ if(languages == null || languages.length < 1){
+ languageSet = null;
+ } else if (languages.length == 1){
+ languageSet = Collections.singleton(languages[0]);
+ } else {
+ languageSet = new HashSet<String>(Arrays.asList(languages));
+ }
+ return new EntityhubEntity(entity.getRepresentation(), fields, languageSet);
+ } else {
+ return null;
+ }
}
@Override
@@ -84,12 +103,13 @@ public final class ReferencedSiteSearche
Set<UriRef> includeFields,
List<String> search,
String[] languages,
- Integer limit) throws IllegalStateException {
+ Integer limit, Integer offset) throws IllegalStateException {
//build the query and than return the result
Site site = getSearchService();
if(site == null){
throw new IllegalStateException("ReferencedSite "+siteId+" is currently not available");
}
+ queryStats.begin();
FieldQuery query = EntitySearcherUtils.createFieldQuery(site.getQueryFactory(),
field, includeFields, search, languages);
if(limit != null && limit > 0){
@@ -97,6 +117,9 @@ public final class ReferencedSiteSearche
} else if(this.limit != null){
query.setLimit(this.limit);
}
+ if(offset != null && offset.intValue() > 0){
+ query.setOffset(offset.intValue());
+ }
QueryResultList<Representation> results;
try {
results = site.find(query);
@@ -105,12 +128,20 @@ public final class ReferencedSiteSearche
search+'@'+Arrays.toString(languages)+"in the ReferencedSite "+
site.getId(), e);
}
- Collection<Entity> entities = new ArrayList<Entity>(results.size());
- for(Representation result : results){
- entities.add(new EntityhubEntity(result));
- }
- return entities;
+ queryStats.complete();
+ if(!results.isEmpty()){
+ Set<String> languagesSet = new HashSet<String>(Arrays.asList(languages));
+ Collection<Entity> entities = new ArrayList<Entity>(results.size());
+ for(Representation result : results){
+ resultStats.begin();
+ entities.add(new EntityhubEntity(result, null, languagesSet));
+ resultStats.complete();
+ }
+ return entities;
+ } else {
+ return Collections.emptyList();
}
+ }
@Override
public boolean supportsOfflineMode() {
Modified: stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/trunk/enhancement-engines/entityhublinking/src/main/resources/OSGI-INF/metatype/metatype.properties Tue Jun 25 06:47:12 2013
@@ -126,3 +126,9 @@ of the text must match a Token of the La
The score is calculated by comparing the matching characters from the beginning of the two \
tokens compared to the overall size of the token. So it allows derivations at the end of the \
tokens (e.g. because of inflected forms of words).
+
+enhancer.engines.linking.entityTypes.name=Entity Type Filter
+enhancer.engines.linking.entityTypes.description=Allows to define a white/black list \
+based on the types of Entities. Use '!{uri}' for black listing and '{uri}' for white \
+listing. Include '*' to force white listing (e.g. to allow Entities without any type). \
+Rules are processed based on their order.
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/Entity.java Tue Jun 25 06:47:12 2013
@@ -22,6 +22,7 @@ import org.apache.clerezza.rdf.core.Lite
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.TypedLiteral;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.commons.collections.Predicate;
@@ -57,14 +58,14 @@ public class Entity implements Comparabl
* of the entity by containing {@link Triple}s that use the {@link #uri} as
* {@link Triple#getSubject() subject}
*/
- protected final MGraph data;
+ protected final TripleCollection data;
/**
* Constructs a new Entity
* @param uri
* @param data
*/
- public Entity(UriRef uri, MGraph data) {
+ public Entity(UriRef uri, TripleCollection data) {
this.uri = uri;
this.data = data;
}
@@ -74,7 +75,7 @@ public class Entity implements Comparabl
public final String getId(){
return uri.getUnicodeString();
}
- public final MGraph getData() {
+ public final TripleCollection getData() {
return data;
}
@SuppressWarnings("unchecked")
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/EntitySearcher.java Tue Jun 25 06:47:12 2013
@@ -21,6 +21,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
+import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.UriRef;
@@ -44,6 +45,7 @@ public interface EntitySearcher {
* @param search the tokens to search for. MUST NOT be <code>null</code>
* @param languages the languages to include in the search
* @param limit The maximum number of results or <code>null</code> to use the default
+ * @param offset The offset of the first requested search result
* @return the Entities found for the specified query containing information for
* all selected fields
* @throws EntitySearcherException An exception while searching for concepts
@@ -51,19 +53,21 @@ public interface EntitySearcher {
* the list with the search terms is <code>null</code> or empty;
*/
Collection<? extends Entity> lookup(UriRef field, Set<UriRef> selectedFields,
- List<String> search, String[] languages, Integer limit)
+ List<String> search, String[] languages, Integer limit, Integer offset)
throws EntitySearcherException;
/**
* Lookup an Entity of the linked vocabulary by the id.
* @param id the id
* @param selectedFields A set of fields that need to be included within the
* returned {@link Representation}. Other fields MAY be also included.
+ * @param languages the list of languages for {@link PlainLiteral}s that
+ * should be included in the returned Entity
* @return the concept or <code>null</code> if not found
* @throws EntitySearcherException on any error while dereferencing the
* Entity with the parsed Id
* @throws IllegalArgumentException if the parsed id is <code>null</code>
*/
- Entity get(UriRef id,Set<UriRef> selectedFields) throws EntitySearcherException;
+ Entity get(UriRef id,Set<UriRef> selectedFields, String...languages) throws EntitySearcherException;
/**
* Returns <code>true</code> if this EntitySearcher can operate without
* dependencies to remote services. This is important because Stanbol can
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/EntityLinkerConfig.java Tue Jun 25 06:47:12 2013
@@ -18,12 +18,14 @@ package org.apache.stanbol.enhancer.engi
import java.net.URI;
import java.net.URISyntaxException;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -69,6 +71,10 @@ public class EntityLinkerConfig {
*/
public static final String TYPE_FIELD = "enhancer.engines.linking.typeField";
/**
+ * Allows to configure a list of entity types that are white/black listed.
+ */
+ public static final String ENTITY_TYPES = "enhancer.engines.linking.entityTypes";
+ /**
* Allows to enable/disable case sensitive matching
*/
public static final String CASE_SENSITIVE = "enhancer.engines.linking.caseSensitive";
@@ -374,6 +380,9 @@ public class EntityLinkerConfig {
private UriRef nameField;
private UriRef redirectField;
private UriRef typeField;
+ private Map<UriRef,Integer> blacklistedTypes = new HashMap<UriRef,Integer>();
+ private Map<UriRef,Integer> whitelistedTypes = new HashMap<UriRef,Integer>();
+ private Boolean defaultWhitelistTypes = null;
private Set<UriRef> dereferencedFields = new HashSet<UriRef>();
private Set<UriRef> __selectedFields;
@@ -832,9 +841,72 @@ public class EntityLinkerConfig {
linkerConfig.setRankEqualScoresBasedOnEntityRankings(
DEFAULT_RANK_EQUAL_SCORES_BASED_ON_ENTITY_RANKINGS);
}
-
+ //init the list of whitelisted/blacklisted types
+ value = configuration.get(ENTITY_TYPES);
+ List<String> entityTypesConfig; //first collect and cleanup the config
+ if(value == null){
+ entityTypesConfig = Collections.emptyList();
+ } else if(value instanceof String[]){
+ entityTypesConfig = new ArrayList<String>();
+ for(String type : (String[])value){
+ if(type != null){
+ type = type.trim();
+ if(!type.isEmpty()){
+ entityTypesConfig.add(type);
+ }
+ }
+ }
+ } else if(value instanceof Collection<?>){
+ entityTypesConfig = new ArrayList<String>();
+ for(Object o : (Collection<Object>)value){
+ if(o != null){
+ String type = o.toString().trim();
+ if(!type.isEmpty()){
+ entityTypesConfig.add(type);
+ }
+ }
+ }
+ } else if(value instanceof String){ //support parsing single values as string
+ String type = value.toString().trim();
+ if(type.isEmpty()){
+ entityTypesConfig = Collections.emptyList();
+ } else {
+ entityTypesConfig = Collections.singletonList(type);
+ }
+ } else {
+ throw new ConfigurationException(ENTITY_TYPES, "The list of ignored types (if present) "
+ + "MUST BE a collection or a string array (present: "+value.getClass().getName()+")!");
+ }
+ //apply the config
+ for(int i = 0; i < entityTypesConfig.size(); i++){
+ String type = entityTypesConfig.get(i);
+ if("*".equals(type)){
+ linkerConfig.setDefaultWhitelistTypes(Boolean.TRUE);
+ } else {
+ boolean blacklisted = type.charAt(0) == '!';
+ if(blacklisted && type.length() < 2){
+ throw new ConfigurationException(ENTITY_TYPES, "The list of whitelisted/blacklisted "
+ + "MUST NOT contain '!' (configured: "+entityTypesConfig+")!");
+ }
+ UriRef uri = new UriRef(getFullName(prefixService, ENTITY_TYPES,
+ blacklisted ? type.substring(1) : type));
+ if(blacklisted){
+ linkerConfig.addBlacklistType(uri, Integer.valueOf(i));
+ } else {
+ linkerConfig.addWhitelistType(uri, Integer.valueOf(i));
+ }
+ }
+ }
}
-
+ /**
+ * Gets the full URI for the parsed value by using the parsed {@link NamespacePrefixService}
+ * @param prefixService the {@link NamespacePrefixService} used to lookup the full URI
+ * @param property the config property (just used to create a {@link ConfigurationException}
+ * in case the used namespace prefix is unknown by the namespace prefix service)
+ * @param value the configured value (might be both a short or a full URI)
+ * @return the full URI
+ * @throws ConfigurationException
+ */
private static String getFullName(NamespacePrefixService prefixService, String property,String value) throws ConfigurationException {
String prefix = NamespaceMappingUtils.getPrefix(value);
if(prefixService == null){
@@ -1334,4 +1406,61 @@ public class EntityLinkerConfig {
this.rankEqualScoresBasedOnEntityRankings = state;
}
+ /**
+ * Registers a type as blacklisted. The call is silently ignored if
+ * either of the parsed arguments is <code>null</code>.
+ * @param type the URI of the type to blacklist
+ * @param order the value stored for the type in the blacklist map (the
+ * configuration parser uses the position of the type in the configured list)
+ */
+ public final void addBlacklistType(UriRef type, Integer order) {
+ if(type == null || order == null){
+ return; //incomplete entries are not registered
+ }
+ blacklistedTypes.put(type, order);
+ }
+ /**
+ * Registers a type as whitelisted. The call is silently ignored if
+ * either of the parsed arguments is <code>null</code>.
+ * @param type the URI of the type to whitelist
+ * @param order the value stored for the type in the whitelist map (the
+ * configuration parser uses the position of the type in the configured list)
+ */
+ public final void addWhitelistType(UriRef type, Integer order) {
+ if(type == null || order == null){
+ return; //incomplete entries are not registered
+ }
+ whitelistedTypes.put(type, order);
+ }
+
+ /**
+ * Setter for the default whitelist state evaluated by
+ * {@link #isDefaultWhitelistTypes()}. The configuration parser sets this
+ * to {@link Boolean#TRUE} if the wildcard <code>'*'</code> is configured.
+ * @param state the state or <code>null</code> to let
+ * {@link #isDefaultWhitelistTypes()} decide based on the whitelist
+ */
+ public final void setDefaultWhitelistTypes(Boolean state){
+ defaultWhitelistTypes = state;
+ }
+
+
+ /**
+ * Getter for the effective default whitelist state.
+ * @return <code>true</code> if the state was explicitly set to
+ * {@link Boolean#TRUE} or if no type is whitelisted. <code>false</code>
+ * only if at least one type is whitelisted and the state is not
+ * {@link Boolean#TRUE}
+ */
+ public final boolean isDefaultWhitelistTypes(){
+ if(Boolean.TRUE.equals(defaultWhitelistTypes)){
+ return true; //explicitly enabled
+ }
+ //The state is either FALSE or not set. In both cases an empty whitelist
+ //results in true: for FALSE this is an illegal configuration that is
+ //ignored; if not set this is the default.
+ return whitelistedTypes.isEmpty();
+ }
+
+ /**
+ * Getter for the blacklisted types.
+ * @return the map with the blacklisted types as keys and the order parsed
+ * to {@link #addBlacklistType(UriRef, Integer)} as values. This is the
+ * internal map and not a copy.
+ */
+ public final Map<UriRef, Integer> getBlacklistedTypes() {
+ return blacklistedTypes;
+ }
+
+
+ /**
+ * Getter for the whitelisted types.
+ * @return the map with the whitelisted types as keys and the order parsed
+ * to {@link #addWhitelistType(UriRef, Integer)} as values. This is the
+ * internal map and not a copy.
+ */
+ public final Map<UriRef, Integer> getWhitelistedTypes() {
+ return whitelistedTypes;
+ }
+ /**
+ * Checks if EntityType filtering is active. This is the case as soon as
+ * at least one type is white- or blacklisted.
+ * @return <code>false</code> if both the whitelist and the blacklist are
+ * empty, otherwise <code>true</code>
+ */
+ public final boolean isEntityTypeFilteringActive(){
+ return !whitelistedTypes.isEmpty() || !blacklistedTypes.isEmpty();
+ }
+
}
\ No newline at end of file
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/LanguageProcessingConfig.java Tue Jun 25 06:47:12 2013
@@ -72,6 +72,26 @@ public class LanguageProcessingConfig im
public static final double DEFAULT_MIN_POS_ANNOTATION_PROBABILITY = 0.75;
/**
+ * Default {@link LexicalCategory LexicalCategories} that allow the EntityLinker
+ * to step-over non matchable tokens when determining search tokens for
+ * Entityhub lookups (Defaults: {@link LexicalCategory#Noun},
+ * {@link LexicalCategory#Punctuation} and {@link LexicalCategory#Conjuction}).
+ */
+ public static final Set<LexicalCategory> DEFAULT_CHUNKABLE_CATEGORIES = EnumSet.of(
+ LexicalCategory.Noun, LexicalCategory.Punctuation, LexicalCategory.Conjuction);
+
+ /**
+ * Default {@link Pos} tags that allow the EntityLinker to step-over non matchable
+ * tokens when determining search tokens for Entityhub lookups
+ * (default: {@link Pos#Preposition}).
+ */
+ private static final Set<Pos> DEFAULT_CHUNKABLE_POS = EnumSet.of(Pos.Preposition);
+ /**
+ * Default string tags that allow the EntityLinker to step-over non matchable
+ * tokens when determining search tokens for Entityhub lookups (default: empty).
+ */
+ private static final Set<String> DEFAULT_CHUNKABKE_TAGS = Collections.emptySet();
+
+ /**
* Default value for POS annotation confidence required for not-processed POS tags
* (not contained in both {@link #getLinkedLexicalCategories()} and
* {@link #getLinkedPosTags()}). <br> The default is
@@ -139,6 +159,9 @@ public class LanguageProcessingConfig im
private boolean ignoreChunksState = DEFAULT_IGNORE_CHUNK_STATE;
+ private Set<LexicalCategory> chunkableCategories = DEFAULT_CHUNKABLE_CATEGORIES;
+ private Set<Pos> chunkablePos = DEFAULT_CHUNKABLE_POS;
+ private Set<String> chunkableTags = DEFAULT_CHUNKABKE_TAGS;
private double minPhraseAnnotationProbability = DEFAULT_MIN_PHRASE_ANNOTATION_PROBABILITY;
@@ -516,6 +539,7 @@ public class LanguageProcessingConfig im
public void setMinSearchTokenLength(int minSearchTokenLength) {
this.minSearchTokenLength = minSearchTokenLength;
}
+
/**
* The minimum number of character a {@link Token} (word) must have to be
* used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
@@ -546,6 +570,76 @@ public class LanguageProcessingConfig im
}
/**
+ * Getter for the chunkable {@link LexicalCategory LexicalCategories}. Those
+ * allow the EntityLinker to step-over non matchable tokens when determining
+ * search tokens for Entityhub lookups.
+ * @return the chunkable categories: the set parsed to
+ * {@link #setChunkableCategories(Set)} or {@link #DEFAULT_CHUNKABLE_CATEGORIES}
+ * if none (or <code>null</code>) was set
+ */
+ public Set<LexicalCategory> getChunkableCategories(){
+ return chunkableCategories;
+ }
+
+ /**
+ * Sets the {@link LexicalCategory LexicalCategories} considered as chunkable.
+ * Those allow the EntityLinker to step-over non matchable tokens when
+ * determining search tokens for Entityhub lookups.
+ * @param categories the chunkable {@link LexicalCategory LexicalCategories}
+ * or <code>null</code> to reset to {@link #DEFAULT_CHUNKABLE_CATEGORIES}
+ */
+ public void setChunkableCategories(Set<LexicalCategory> categories){
+ this.chunkableCategories = categories == null ? DEFAULT_CHUNKABLE_CATEGORIES : categories;
+ }
+
+ /**
+ * Sets the {@link Pos} tags considered as chunkable. Those allow the
+ * EntityLinker to step-over non matchable tokens when determining search
+ * tokens for Entityhub lookups.
+ * @param pos the chunkable {@link Pos} tags or <code>null</code> to reset
+ * to the default
+ */
+ public void setChunkablePos(Set<Pos> pos){
+ this.chunkablePos = pos == null ? DEFAULT_CHUNKABLE_POS : pos;
+ }
+
+ /**
+ * Sets the String tags considered as chunkable. Those allow the
+ * EntityLinker to step-over non matchable tokens when determining search
+ * tokens for Entityhub lookups.
+ * @param tags the chunkable String tags or <code>null</code> to reset
+ * to the default
+ */
+ public void setChunkableTags(Set<String> tags){
+ this.chunkableTags = tags == null ? DEFAULT_CHUNKABKE_TAGS : tags;
+ }
+ /**
+ * Getter for the {@link Pos} tags considered by the EntityLinker to step-over
+ * non matchable tokens when determining search tokens for Entityhub lookups
+ * @return the chunkable {@link Pos} tags: the set parsed to
+ * {@link #setChunkablePos(Set)} or the default if none (or <code>null</code>) was set
+ */
+ public Set<Pos> getChunkablePos(){
+ return chunkablePos;
+ }
+
+ /**
+ * Getter for the String tags considered by the EntityLinker to step-over
+ * non matchable tokens when determining search tokens for Entityhub lookups
+ * @return the String tags considered as chunkable: the set parsed to
+ * {@link #setChunkableTags(Set)} or the default if none (or <code>null</code>) was set
+ */
+ public Set<String> getChunkableTags(){
+ return chunkableTags;
+ }
+
+ /**
* Clones the {@link LanguageProcessingConfig}. Intended to be used
* to create language specific configs based on the default one.
*/
@@ -568,6 +662,9 @@ public class LanguageProcessingConfig im
c.matchedLexicalCategories = matchedLexicalCategories;
c.minSearchTokenLength = minSearchTokenLength;
c.linkOnlyUpperCaseTokenWithUnknownPos = linkOnlyUpperCaseTokenWithUnknownPos;
+ c.chunkableCategories = chunkableCategories;
+ c.chunkablePos = chunkablePos;
+ c.chunkableTags = chunkableTags;
return c;
}
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/config/TextProcessingConfig.java Tue Jun 25 06:47:12 2013
@@ -126,6 +126,11 @@ public class TextProcessingConfig {
public static final String PARAM_POS_TYPES = "pos";
public static final String PARAM_POS_TAG = "tag";
public static final String PARAM_POS_PROBABILITY = "prob";
+
+ public static final String PARAM_CHUNKABLE_CATEGORIES = "cc"; //parameter key: chunkable LexicalCategories (STANBOL-1117)
+ public static final String PARAM_CHUNKABLE_POS_TYPES = "cp"; //parameter key: chunkable Pos types (STANBOL-1117)
+ public static final String PARAM_CHUNKABLE_TAGS = "ct"; //parameter key: chunkable String tags (STANBOL-1117)
+
/**
* Parameter used to configure how to deal with upper case tokens
*/
@@ -414,6 +419,24 @@ public class TextProcessingConfig {
} else {
log.info(" - use upper case token mode: match={}, link={}", tpc.isMatchUpperCaseTokens(), tpc.isLinkUpperCaseTokens());
}
+ //apply chunkable parameters (STANBOL-1117)
+ if(config.containsKey(PARAM_CHUNKABLE_CATEGORIES)){
+ Set<LexicalCategory> chunkableCategories = parseEnumParam(config, PROCESSED_LANGUAGES,
+ language, PARAM_CHUNKABLE_CATEGORIES, LexicalCategory.class);
+ log.info(" ... set chunkable Categories to {}", chunkableCategories);
+ tpc.setChunkableCategories(chunkableCategories);
+ }
+ if(config.containsKey(PARAM_CHUNKABLE_POS_TYPES)){
+ Set<Pos> chunkablePos = parseEnumParam(config, PROCESSED_LANGUAGES,
+ language, PARAM_CHUNKABLE_POS_TYPES, Pos.class);
+ log.info(" ... set chunkable POS tags to {}", chunkablePos);
+ tpc.setChunkablePos(chunkablePos);
+ }
+ if(config.containsKey(PARAM_CHUNKABLE_TAGS)){
+ Set<String> chunkableTags = parseStringTags(config.get(PARAM_CHUNKABLE_TAGS));
+ log.info(" ... set chunkable String tags to {}", chunkableTags);
+ tpc.setChunkableTags(chunkableTags);
+ }
}
private static Boolean parseState(Map<String,String> config, String param){
Modified: stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java?rev=1496359&r1=1496358&r2=1496359&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/entitylinking/engine/src/main/java/org/apache/stanbol/enhancer/engines/entitylinking/engine/EntityLinkingEngine.java Tue Jun 25 06:47:12 2013
@@ -261,6 +261,9 @@ public class EntityLinkingEngine impleme
log.error("Unable to link Entities with "+entityLinker,e);
throw new EngineException(this, ci, "Unable to link Entities with "+entityLinker, e);
}
+ if(log.isInfoEnabled()){
+ entityLinker.logStatistics(log);
+ }
//write results (requires a write lock)
ci.getLock().writeLock().lock();
try {