You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/20 08:57:36 UTC
svn commit: r1559637 - in
/stanbol/branches/release-0.12/enhancement-engines/dereference:
core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/
entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/
entityhub/s...
Author: rwesten
Date: Mon Jan 20 07:57:35 2014
New Revision: 1559637
URL: http://svn.apache.org/r1559637
Log:
implementation for STANBOL-1259 for the 0.12 branch. This also adds the new features as configuration options to the EntityhubDereferenceEngine
Modified:
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties
Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java Mon Jan 20 07:57:35 2014
@@ -82,4 +82,32 @@ public interface DereferenceConstants {
*/
String DEREFERENCE_ENTITIES_LDPATH = "enhancer.engines.dereference.ldpath";
+ /**
+ * A URI prefix checked for entity URIs. Only entities that match any of the
+ * configured prefixes or a {@link #URI_PATTERN} will be dereferenced. If neither
+ * patterns nor prefixes are configured all entities will be dereferenced.
+ * This has lower priority than {@link #FALLBACK_MODE}.
+ * @see #FALLBACK_MODE
+ */
+ String URI_PREFIX = "enhancer.engines.dereference.uriPrefix";
+
+
+ /**
+ * Regex pattern applied to entity URIs. Only entities that match any of
+ * the configured {@link #URI_PREFIX} values or patterns will be dereferenced.
+ * If neither patterns nor prefixes are configured all entities will be dereferenced.
+ * This has lower priority than {@link #FALLBACK_MODE}.
+ * @see #FALLBACK_MODE
+ */
+ String URI_PATTERN = "enhancer.engines.dereference.uriPattern";
+
+ /**
+ * If fallback mode is activated, a dereference engine will not try to
+ * dereference entities for which triples have already been added to the
+ * enhancement results.
+ */
+ String FALLBACK_MODE = "enhancer.engines.dereference.fallback";
+
+ boolean DEFAULT_FALLBACK_MODE = true;
+
}
Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java Mon Jan 20 07:57:35 2014
@@ -213,4 +213,60 @@ public class DereferenceEngineConfig imp
return config;
}
+    /**
+     * Reports whether the {@link DereferenceConstants#FALLBACK_MODE} is
+     * enabled for this configuration.
+     * @return the state parsed from the string form of the configured value,
+     * or {@link DereferenceConstants#DEFAULT_FALLBACK_MODE} if no value is set
+     */
+    public boolean isFallbackMode(){
+        final Object configured = config.get(FALLBACK_MODE);
+        if(configured == null){ //nothing configured -> use the default
+            return DereferenceConstants.DEFAULT_FALLBACK_MODE;
+        }
+        return Boolean.parseBoolean(configured.toString());
+    }
+
+    /**
+     * Getter for the values configured for
+     * {@link DereferenceConstants#URI_PATTERN}.
+     * @return the URI patterns. An empty List if none
+     */
+    public List<String> getUriPatterns(){
+        return getStrValues(config.get(DereferenceConstants.URI_PATTERN));
+    }
+    /**
+     * Getter for the values configured for
+     * {@link DereferenceConstants#URI_PREFIX}.
+     * @return the URI prefixes. An empty List if none
+     */
+    public List<String> getUriPrefixes(){
+        return getStrValues(config.get(DereferenceConstants.URI_PREFIX));
+    }
+    /**
+     * Extracts the non-blank String values from the parsed configuration value.
+     * Blank values and <code>null</code> elements are ignored.
+     * @param value the value ({@code String}, {@code String[]} or
+     * {@code Collection<?>}). Elements of a Collection are converted by
+     * calling {@link Object#toString()}. Any other type (including
+     * <code>null</code>) results in an empty list.
+     * @return the values as List in the parsed order. Never <code>null</code>.
+     * The returned List is immutable for a single String value and if no
+     * values are present.
+     */
+    private List<String> getStrValues(Object value) {
+        final List<String> values;
+        if(value instanceof String){
+            values = StringUtils.isBlank((String)value) ?
+                    Collections.<String>emptyList() :
+                    Collections.singletonList((String)value);
+        } else if(value instanceof String[]){
+            values = new ArrayList<String>();
+            for(String pattern : (String[])value){
+                if(!StringUtils.isBlank(pattern)){
+                    values.add(pattern);
+                }
+            }
+        } else if(value instanceof Collection<?>){
+            values = new ArrayList<String>();
+            for(Object pattern : (Collection<?>)value){
+                //keep only non-blank elements (same rule as for String[])
+                if(pattern != null && !StringUtils.isBlank(pattern.toString())){
+                    values.add(pattern.toString());
+                }
+            }
+        } else {
+            values = Collections.emptyList();
+        }
+        return values;
+    }
}
Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java Mon Jan 20 07:57:35 2014
@@ -16,6 +16,7 @@
*/
package org.apache.stanbol.enhancer.engines.dereference;
+import static org.apache.stanbol.enhancer.engines.dereference.DereferenceConstants.URI_PATTERN;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
@@ -32,6 +33,9 @@ import java.util.concurrent.ExecutionExc
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.locks.Lock;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.MGraph;
@@ -74,6 +78,13 @@ public class EntityDereferenceEngine imp
protected final boolean filterAcceptLanguages;
+ protected final boolean uriFilterPresent;
+
+ protected final List<String> prefixList;
+
+ protected final List<Pattern> patternList;
+
+ protected final boolean fallbackMode;
/**
* The Map holding the {@link #serviceProperties} for this engine.
*/
@@ -90,14 +101,53 @@ public class EntityDereferenceEngine imp
}
this.config = config;
this.name = config.getEngineName();
+ log.debug("create {} name {}", getClass().getSimpleName(), name);
this.filterContentLanguages = config.isFilterContentLanguages();
+ log.debug(" - filter content languages: {}", filterContentLanguages);
this.filterAcceptLanguages = config.isFilterAcceptLanguages();
+ log.debug(" - filter Accept languages: {}", filterAcceptLanguages);
if(dereferencer == null){
throw new IllegalArgumentException("The parsed EntityDereferencer MUST NOT be NULL!");
}
this.dereferencer = dereferencer;
- //init the defautl ordering
- setEngineOrdering(DEFAULT_ENGINE_ORDERING);
+ log.debug(" - dereferenced {} (type: {})", dereferencer, dereferencer.getClass().getName());
+ //init the default ordering
+ this.fallbackMode = config.isFallbackMode();
+ log.debug(" - fallback Mode: {}", fallbackMode);
+ //Set the default engine ordering based on the fallback mode state:
+ //in case of fallback mode call this after dereferencing engines
+ //without fallback mode
+ setEngineOrdering(fallbackMode ? DEFAULT_ENGINE_ORDERING - 1 :
+ DEFAULT_ENGINE_ORDERING);
+ log.debug(" - engine order: {}", getEngineOrdering());
+ //sort the prefixes
+ prefixList = config.getUriPrefixes();
+ if(prefixList.size() > 1){
+ Collections.sort(prefixList);
+ }
+ if(log.isDebugEnabled()){
+ log.debug(" - configured prefixes:");
+ for(String prefix : prefixList){
+ log.debug(" {}",prefix);
+ }
+ }
+ //compile the patterns
+ patternList = new ArrayList<Pattern>();
+ for(String pattern : config.getUriPatterns()){
+ try {
+ patternList.add(Pattern.compile(pattern));
+ } catch (PatternSyntaxException e){
+ throw new IllegalStateException("Unable to compile URI pattern '"
+ + pattern + "' pared via property '" + URI_PATTERN + "'!");
+ }
+ }
+ if(log.isDebugEnabled()){
+ log.debug(" - configured patterns:");
+ for(Pattern pattern : patternList){
+ log.debug(" {}",pattern);
+ }
+ }
+ uriFilterPresent = !prefixList.isEmpty() || !patternList.isEmpty();
}
/**
@@ -157,6 +207,7 @@ public class EntityDereferenceEngine imp
return;
}
log.debug("> dereference Entities for ContentItem {}", ci.getUri());
+ long start = System.nanoTime();
final DereferenceContext derefContext = new DereferenceContext(offline);
Set<String> includedLangs = new HashSet<String>();
//TODO: parse accept languages as soon as Enhancement properties are implemented
@@ -172,27 +223,27 @@ public class EntityDereferenceEngine imp
}
} //no content language filtering - leave contentLanguages empty
//parse the referenced entities from the graph
+ Set<UriRef> checked = new HashSet<UriRef>();
Iterator<Triple> entityReferences = metadata.filter(null, ENHANCER_ENTITY_REFERENCE, null);
while(entityReferences.hasNext()){
Triple triple = entityReferences.next();
Resource entityReference = triple.getObject();
- if(entityReference instanceof UriRef){
+ if((entityReference instanceof UriRef) && //only URIs
+ checked.add((UriRef)entityReference) && //do not check a URI twice
+ chekcFallbackMode((UriRef)entityReference, metadata) && //fallback mode
+ checkURI((UriRef)entityReference)){ //URI prefixes and patterns
boolean added = referencedEntities.add((UriRef)entityReference);
if(added && log.isTraceEnabled()){
log.trace(" ... schedule Entity {}", entityReference);
}
- } else if(log.isWarnEnabled()){
- //log enhancement that use a fise:entiy-reference with a non UriRef value!
- NonLiteral enhancement = triple.getSubject();
- log.warn("Can not dereference invalid Enhancement {}",enhancement);
- for(Iterator<Triple> it = metadata.filter(enhancement, null, null);it.hasNext();){
- log.warn(" {}", it.next());
- }
+ } else if(log.isTraceEnabled()){
+ log.trace(" ... ignore Entity {}",entityReferences);
}
}
} finally {
ci.getLock().readLock().unlock();
}
+ long schedule = System.nanoTime();
if(!includedLangs.isEmpty()){
includedLangs.add(null); //also include literals without language
//and set the list to the dereference context
@@ -204,7 +255,6 @@ public class EntityDereferenceEngine imp
referencedEntities.size());
//(2) dereference the Entities
ExecutorService executor = dereferencer.getExecutor();
- long start = System.currentTimeMillis();
Set<UriRef> failedEntities = new HashSet<UriRef>();
int dereferencedCount = 0;
List<DereferenceJob> dereferenceJobs = new ArrayList<DereferenceJob>(
@@ -256,25 +306,99 @@ public class EntityDereferenceEngine imp
}
}
}
- long duration = System.currentTimeMillis() - start;
+ long end = System.nanoTime();
+ float sheduleDuration = ((schedule - start)/10000)/100f;
+ float dereferenceDuration = ((end - schedule)/10000)/100f;
+ float duration = ((end - start)/10000)/100f;
if(!failedEntities.isEmpty()){
log.warn(" - unable to dereference {} of {} for ContentItem {}",
new Object[] {failedEntities.size(),referencedEntities.size(),
ci.getUri()});
}
if(log.isDebugEnabled() && dereferencedCount > 0){
- log.debug(" - dereferenced {} of {} Entities in {}ms ({}ms/dereferenced)",
- new Object[]{dereferencedCount, referencedEntities.size(),
- duration, (duration*100/dereferencedCount)/100.0f});
+ log.debug(" - dereferenced {} of {} Entities in {}ms | schedule:{}ms | "
+ + " dereference: {}ms ({}ms/entity)", new Object[]{
+ dereferencedCount, referencedEntities.size(),
+ duration, sheduleDuration, dereferenceDuration,
+ dereferenceDuration/dereferencedCount});
}
}
- @Override
+ @Override
public String getName() {
return name;
}
+    /**
+     * Checks if the parsed entity needs to be processed with respect to the
+     * fallback mode of this engine.
+     * @param entityReference the referenced entity
+     * @param metadata the graph checked for data about the entity
+     * @return <code>true</code> if fallback mode is deactivated or if
+     * <code>metadata.filter(entityReference, null, null)</code> does not
+     * return any triple. Otherwise <code>false</code>.
+     */
+    protected boolean chekcFallbackMode(UriRef entityReference, MGraph metadata) {
+        if(!fallbackMode){
+            return true; //no fallback mode: process all entities
+        }
+        //in fallback mode only accept entities without an outgoing relation
+        return !metadata.filter(entityReference, null, null).hasNext();
+    }
+    /**
+     * Checks if we need to schedule an Entity based on its URI. This uses
+     * configured URI prefixes and URI patterns. If neither prefixes nor
+     * patterns are configured all entities are accepted.
+     * @param entity the entity to check
+     * @return <code>true</code> if this entity should be scheduled for
+     * dereferencing. <code>false</code> if not.
+     */
+    protected boolean checkURI(UriRef entity){
+        if(!uriFilterPresent){ //if no prefix nor pattern is set
+            return true; //accept all
+        }
+        String entityUri = entity.getUnicodeString();
+        log.trace(" - checkURI {}", entityUri);
+        //(1) check against prefixes first as this is faster
+        if(!prefixList.isEmpty()){
+            //NOTE: this requires the prefixList to be sorted in natural order
+            int pos = Collections.binarySearch(prefixList, entityUri);
+            if(pos >= 0){
+                return true; //entityUri found in list
+            }
+            /*
+             * For a missing key binarySearch returns (-(insertion point) - 1).
+             * Every prefix of the entityUri sorts before the entityUri itself,
+             * so candidates are only found at indexes < insertion point.
+             *
+             * Example:
+             *    ["a","b"] <- "bc"
+             *    binary search returns -3 (insertion point 2) -> start at 1
+             *
+             * Example2:
+             *    ["b"] <- "a"
+             *    binary search returns -1 (insertion point 0) -> start at -1
+             *    so the loop is not entered (no candidate prefix)
+             *
+             * We need to step backwards over all candidates (and not only
+             * check the first one) to support overlapping prefixes:
+             *    ["http://a/","http://a/b/"] <- "http://a/c"
+             */
+            for(int prefixPos = -pos - 2; prefixPos >= 0; prefixPos--){
+                String prefix = prefixList.get(prefixPos);
+                if(entityUri.startsWith(prefix)){
+                    log.trace(" ... matched prefix {}", prefix);
+                    return true; //it matches a prefix in the list
+                }
+                log.trace(" ... no match for prefix {}", prefix);
+            }
+        }
+        //(2) check against regex (no-op if no pattern is configured)
+        for(Pattern pattern : patternList){
+            if(pattern.matcher(entityUri).find()){
+                log.trace(" ... matches pattern {}", pattern);
+                return true;
+            }
+            //try the next pattern
+            log.trace(" ... no match for pattern {}", pattern);
+        }
+        return false; //no match
+    }
+
+
/**
* Used both as {@link Callable} submitted to the {@link ExecutorService}
* and as object to {@link #await()} the completion of the task.
Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java Mon Jan 20 07:57:35 2014
@@ -63,8 +63,12 @@ import org.slf4j.LoggerFactory;
@org.apache.felix.scr.annotations.Properties(value={
@Property(name=PROPERTY_NAME),
@Property(name=EntityhubDereferenceEngine.SITE_ID),
+ @Property(name=DereferenceConstants.FALLBACK_MODE,
+ boolValue=DereferenceConstants.DEFAULT_FALLBACK_MODE),
+ @Property(name=DereferenceConstants.URI_PREFIX, cardinality=Integer.MAX_VALUE),
+ @Property(name=DereferenceConstants.URI_PATTERN, cardinality=Integer.MAX_VALUE),
@Property(name=DereferenceConstants.FILTER_CONTENT_LANGUAGES,
- boolValue=DereferenceConstants.DEFAULT_FILTER_CONTENT_LANGUAGES),
+ boolValue=DereferenceConstants.DEFAULT_FILTER_CONTENT_LANGUAGES),
@Property(name=DEREFERENCE_ENTITIES_FIELDS,cardinality=Integer.MAX_VALUE,
value={"rdfs:comment","geo:lat","geo:long","foaf:depiction","dbp-ont:thumbnail"}),
@Property(name=DEREFERENCE_ENTITIES_LDPATH, cardinality=Integer.MAX_VALUE),
Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Jan 20 07:57:35 2014
@@ -53,3 +53,23 @@ enhancer.engine.dereference.filterConten
enhancer.engine.dereference.filterContentlanguages.description=If enabled only Literals \
in the language detected for the parsed Content (or with no defined language) are dereferenced
+
+enhancer.engines.dereference.fallback.name=Fallback Mode
+enhancer.engines.dereference.fallback.description=If enabled the engine will only \
+try to dereference Entities for which no data were yet added to the Enhancement Results
+
+enhancer.engines.dereference.uriPrefix.name= URI Prefix
+enhancer.engines.dereference.uriPrefix.description=URI prefixes such as \
+'http://rdf.freebase.com/ns/' this engine will try to resolve. NOTE that \
+this engine will use both the configured URI prefixes AND URI patterns to check \
+if it can dereference an Entity. If any of those configurations matches the \
+Entity will be dereferenced.
+
+enhancer.engines.dereference.uriPattern.name=URI Pattern
+enhancer.engines.dereference.uriPattern.description=Regex patterns matched \
+against entity URIs (e.g. '^http://(\\w+\\.)?dbpedia\\.org/resource/.*' would match \
+dbpedia.org Resources regardless of the language). NOTE that \
+this engine will use both the configured URI prefixes AND URI patterns to check \
+if it can dereference an Entity. If any of those configurations matches the \
+Entity will be dereferenced.
+