You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/01/20 08:57:36 UTC

svn commit: r1559637 - in /stanbol/branches/release-0.12/enhancement-engines/dereference: core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/ entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/ entityhub/s...

Author: rwesten
Date: Mon Jan 20 07:57:35 2014
New Revision: 1559637

URL: http://svn.apache.org/r1559637
Log:
implementation for STANBOL-1259 for the 0.12 branch. This also adds the new features as configuration options to the EntityhubDereferenceEngine

Modified:
    stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java
    stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java
    stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java
    stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java
    stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties

Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java Mon Jan 20 07:57:35 2014
@@ -82,4 +82,32 @@ public interface DereferenceConstants {
      */
     String DEREFERENCE_ENTITIES_LDPATH = "enhancer.engines.dereference.ldpath";
 
+	/**
+	 * A URI prefix checked for entity URIs. Only entities that match any of the
+	 * configured prefixes or {@link #URI_PATTERN patterns} will be dereferenced.
+	 * If neither patterns nor prefixes are configured all entities will be dereferenced.
+	 * This has lower priority than {@link #FALLBACK_MODE}.
+	 * @see #FALLBACK_MODE
+	 */
+    String URI_PREFIX = "enhancer.engines.dereference.uriPrefix";
+    
+    
+	/**
+	 * Regex pattern applied to entity URIs. Only entities that match any of
+	 * the configured {@link #URI_PREFIX prefixes} or patterns will be dereferenced.
+	 * If neither patterns nor prefixes are configured all entities will be dereferenced.
+	 * This has lower priority than {@link #FALLBACK_MODE}.
+	 * @see #FALLBACK_MODE
+	 */
+    String URI_PATTERN = "enhancer.engines.dereference.uriPattern";
+    
+    /**
+     * If fallback mode is activated a dereference engine will not try to
+     * dereference entities for which triples have already been added to the
+     * enhancement results.
+     */
+    String FALLBACK_MODE = "enhancer.engines.dereference.fallback";
+    
+    boolean DEFAULT_FALLBACK_MODE = true;
+    
 }

Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java Mon Jan 20 07:57:35 2014
@@ -213,4 +213,60 @@ public class DereferenceEngineConfig imp
         return config;
     }
     
+    /**
+     * Reads the {@link DereferenceConstants#FALLBACK_MODE} state from the configuration.
+     * @return the parsed value, or the default state if the property is not set
+     */
+    public boolean isFallbackMode(){
+        final Object configured = config.get(FALLBACK_MODE);
+        return configured != null ? Boolean.parseBoolean(configured.toString())
+                : DereferenceConstants.DEFAULT_FALLBACK_MODE;
+    }
+    
+    /**
+     * Getter for the values configured for {@link DereferenceConstants#URI_PATTERN}.
+     * @return the URI patterns as extracted by {@link #getStrValues(Object)}.
+     * An empty List if none
+     */
+    public List<String> getUriPatterns(){
+        return getStrValues(config.get(DereferenceConstants.URI_PATTERN));
+    }
+    /**
+     * Getter for the values configured for {@link DereferenceConstants#URI_PREFIX}.
+     * @return the URI prefixes as extracted by {@link #getStrValues(Object)}.
+     * An empty List if none
+     */
+    public List<String> getUriPrefixes(){
+        return getStrValues(config.get(DereferenceConstants.URI_PREFIX));
+    }
+	/**
+	 * Extracts the non-blank String values from the parsed value.
+	 * @param value the value (String, String[] or Collection&lt;?&gt;)
+	 * @return the non-blank values as List in the parsed order (empty if none)
+	 */
+	private List<String> getStrValues(Object value) {
+		final List<String> values;
+		if(value instanceof String){
+			values = StringUtils.isBlank((String)value) ?
+					Collections.<String>emptyList() : Collections.singletonList((String)value);
+		} else if(value instanceof String[]){
+			values = new ArrayList<String>();
+			for(String element : (String[])value){
+				if(!StringUtils.isBlank(element)){
+					values.add(element);
+				}
+			}
+		} else if(value instanceof Collection<?>){
+			values = new ArrayList<String>();
+			for(Object element : (Collection<?>)value){
+				//keep non-blank elements only (as for String and String[] values)
+				if(element != null && !StringUtils.isBlank(element.toString())){
+					values.add(element.toString());
+				}
+			}
+		} else { //null or unsupported type
+			values = Collections.emptyList();
+		}
+		return values;
+	}
 }

Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java Mon Jan 20 07:57:35 2014
@@ -16,6 +16,7 @@
  */
 package org.apache.stanbol.enhancer.engines.dereference;
 
+import static org.apache.stanbol.enhancer.engines.dereference.DereferenceConstants.URI_PATTERN;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
 
@@ -32,6 +33,9 @@ import java.util.concurrent.ExecutionExc
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.locks.Lock;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
 
 import org.apache.clerezza.rdf.core.Language;
 import org.apache.clerezza.rdf.core.MGraph;
@@ -74,6 +78,13 @@ public class EntityDereferenceEngine imp
     
     protected final boolean filterAcceptLanguages;
     
+    protected final boolean uriFilterPresent;
+    
+    protected final List<String> prefixList;
+    
+    protected final List<Pattern> patternList;
+    
+    protected final boolean fallbackMode;
     /**
      * The Map holding the {@link #serviceProperties} for this engine.
      */
@@ -90,14 +101,53 @@ public class EntityDereferenceEngine imp
         }
         this.config = config;
         this.name = config.getEngineName();
+        log.debug("create {} name {}", getClass().getSimpleName(), name);
         this.filterContentLanguages = config.isFilterContentLanguages();
+        log.debug(" - filter content languages: {}", filterContentLanguages);
         this.filterAcceptLanguages = config.isFilterAcceptLanguages();
+        log.debug(" - filter Accept languages: {}", filterAcceptLanguages);
         if(dereferencer == null){
             throw new IllegalArgumentException("The parsed EntityDereferencer MUST NOT be NULL!");
         }
         this.dereferencer = dereferencer;
-        //init the defautl ordering
-        setEngineOrdering(DEFAULT_ENGINE_ORDERING);
+        log.debug(" - dereferenced {} (type: {})", dereferencer, dereferencer.getClass().getName());
+        //init the default ordering
+        this.fallbackMode = config.isFallbackMode();
+        log.debug(" - fallback Mode: {}", fallbackMode);
+        //Set the default engine ordering based on the fallback mode state:
+        //in case of fallback mode call this after dereferencing engines 
+        //without fallback mode
+        setEngineOrdering(fallbackMode ? DEFAULT_ENGINE_ORDERING - 1 : 
+        	DEFAULT_ENGINE_ORDERING);
+        log.debug(" - engine order: {}", getEngineOrdering());
+        //sort the prefixes
+        prefixList = config.getUriPrefixes();
+        if(prefixList.size() > 1){
+        	Collections.sort(prefixList);
+        }
+        if(log.isDebugEnabled()){
+        	log.debug(" - configured prefixes:");
+        	for(String prefix : prefixList){
+        		log.debug("     {}",prefix);
+        	}
+        }
+        //compile the patterns
+        patternList = new ArrayList<Pattern>();
+        for(String pattern : config.getUriPatterns()){
+        	try {
+        		patternList.add(Pattern.compile(pattern));
+        	} catch (PatternSyntaxException e){
+        		throw new IllegalStateException("Unable to compile URI pattern '"
+        				+ pattern + "' pared via property '" + URI_PATTERN + "'!");
+        	}
+        }
+        if(log.isDebugEnabled()){
+        	log.debug(" - configured patterns:");
+        	for(Pattern pattern : patternList){
+        		log.debug("     {}",pattern);
+        	}
+        }
+        uriFilterPresent = !prefixList.isEmpty() || !patternList.isEmpty();
     }
     
     /**
@@ -157,6 +207,7 @@ public class EntityDereferenceEngine imp
             return;
         }
         log.debug("> dereference Entities for ContentItem {}", ci.getUri());
+        long start = System.nanoTime();
         final DereferenceContext derefContext = new DereferenceContext(offline);
         Set<String> includedLangs = new HashSet<String>();
         //TODO: parse accept languages as soon as Enhancement properties are implemented
@@ -172,27 +223,27 @@ public class EntityDereferenceEngine imp
                 }
             } //no content language filtering - leave contentLanguages empty
             //parse the referenced entities from the graph
+            Set<UriRef> checked = new HashSet<UriRef>();
             Iterator<Triple> entityReferences = metadata.filter(null, ENHANCER_ENTITY_REFERENCE, null);
             while(entityReferences.hasNext()){
                 Triple triple = entityReferences.next();
                 Resource entityReference = triple.getObject();
-                if(entityReference instanceof UriRef){
+                if((entityReference instanceof UriRef) && //only URIs
+                		checked.add((UriRef)entityReference) && //do not check a URI twice
+                		chekcFallbackMode((UriRef)entityReference, metadata) && //fallback mode
+                		checkURI((UriRef)entityReference)){ //URI prefixes and patterns
                     boolean added = referencedEntities.add((UriRef)entityReference);
                     if(added && log.isTraceEnabled()){
                         log.trace("  ... schedule Entity {}", entityReference);
                     }
-                } else if(log.isWarnEnabled()){
-                    //log enhancement that use a fise:entiy-reference with a non UriRef value!
-                    NonLiteral enhancement = triple.getSubject();
-                    log.warn("Can not dereference invalid Enhancement {}",enhancement);
-                    for(Iterator<Triple> it = metadata.filter(enhancement, null, null);it.hasNext();){
-                        log.warn("   {}", it.next());
-                    }
+                } else if(log.isTraceEnabled()){
+                    log.trace(" ... ignore Entity {}",entityReferences);
                 }
             }
         } finally {
             ci.getLock().readLock().unlock();
         }
+        long schedule = System.nanoTime();
         if(!includedLangs.isEmpty()){
             includedLangs.add(null); //also include literals without language
             //and set the list to the dereference context
@@ -204,7 +255,6 @@ public class EntityDereferenceEngine imp
             referencedEntities.size());
         //(2) dereference the Entities
         ExecutorService executor = dereferencer.getExecutor();
-        long start = System.currentTimeMillis();
         Set<UriRef> failedEntities = new HashSet<UriRef>();
         int dereferencedCount = 0;
         List<DereferenceJob> dereferenceJobs = new ArrayList<DereferenceJob>(
@@ -256,25 +306,99 @@ public class EntityDereferenceEngine imp
                 }
             }
         }
-        long duration = System.currentTimeMillis() - start;
+        long end = System.nanoTime();
+        float sheduleDuration = ((schedule - start)/10000)/100f;
+        float dereferenceDuration = ((end - schedule)/10000)/100f;
+        float duration = ((end - start)/10000)/100f;
         if(!failedEntities.isEmpty()){
             log.warn(" - unable to dereference {} of {} for ContentItem {}",
                 new Object[] {failedEntities.size(),referencedEntities.size(), 
                     ci.getUri()});
         }
         if(log.isDebugEnabled() && dereferencedCount > 0){
-            log.debug(" - dereferenced {} of {} Entities in {}ms ({}ms/dereferenced)", 
-                new Object[]{dereferencedCount, referencedEntities.size(),
-                    duration, (duration*100/dereferencedCount)/100.0f});
+            log.debug(" - dereferenced {} of {} Entities in {}ms | schedule:{}ms | "
+            		+ " dereference: {}ms ({}ms/entity)", new Object[]{
+            				dereferencedCount, referencedEntities.size(),
+            				duration, sheduleDuration, dereferenceDuration,
+            				dereferenceDuration/dereferencedCount});
         }
         
     }
 
-    @Override
+	@Override
     public String getName() {
         return name;
     }
 
+    protected boolean chekcFallbackMode(UriRef entityReference, MGraph metadata) {
+        //without fallback mode every entity passes; with fallback mode only
+        //entities that are not yet the subject of any triple in the metadata
+        return !fallbackMode
+                || !metadata.filter(entityReference, null, null).hasNext();
+    }
+    /**
+     * Checks if we need to schedule an Entity based on its URI. This uses
+     * configured URI prefixes and URI patterns. Prefixes are checked before
+     * the patterns; the prefix check requires {@link #prefixList} to be sorted.
+     * An entity is accepted if no filter is configured, if its URI starts with
+     * the tested prefix or if any pattern is found ({@link Matcher#find()}) in it.
+     * @param entity the entity to check
+     * @return <code>true</code> if this entity should be scheduled for
+     * dereferencing. <code>false</code> if not.
+     */
+    protected boolean checkURI(UriRef entity){
+        if(!uriFilterPresent){ //if no prefix nor pattern is set
+            return true; //accept all
+        }
+        String entityUri = entity.getUnicodeString();
+        log.trace(" - checkURI {}", entityUri);
+        //(1) check against prefixes first as this is faster
+        if(!prefixList.isEmpty()){
+            //as we do not want to check with all configured prefixes let us do a
+            //binary search for the correct one
+            int pos = Collections.binarySearch(prefixList, entityUri);
+            if(pos >= 0){
+                return true; //entityUri found in list
+            }
+            /*
+             * No exact match: binary search returned (-(insert point) - 1) and
+             * the prefix to test is the element right before the insert point.
+             *
+             * Example: ["a","b"] <- "bc"
+             * binary search returns -3 (insert point 2) -> prefix at pos 1
+             *
+             * Example2: ["b"] <- "a"
+             * binary search returns -1 (insert point 0) -> pos -1
+             * therefore a negative prefixPos needs to be checked BEFORE the
+             * list is accessed (List#get(-1) throws IndexOutOfBoundsException)
+             *
+             * NOTE(review): only this single element is tested. With nested
+             * prefixes (e.g. "http://a/" and "http://a/b/") the URI
+             * "http://a/c" is only compared with "http://a/b/" and so not
+             * accepted by the prefix check - confirm if this is intended.
+             */
+            int prefixPos = -pos - 2;
+            if(prefixPos >= 0){
+                String prefix = prefixList.get(prefixPos);
+                if(entityUri.startsWith(prefix)){
+                    log.trace(" ... matched prefix {}", prefix);
+                    return true; //it matches a prefix in the list
+                } //else try configured regex pattern
+                log.trace("  ... no match for prefix {}", prefix);
+            }
+        }
+        //(2) check against regex
+        for(Pattern pattern : patternList){
+            if(pattern.matcher(entityUri).find()){
+                log.trace("  ... matches pattern {}", pattern);
+                return true;
+            } //else try the next pattern
+            log.trace("  ... no match for pattern {}", pattern);
+        }
+        return false; //no match
+    }
+    
+    
     /**
      * Used both as {@link Callable} submitted to the {@link ExecutorService}
      * and as object to {@link #await()} the completion of the task.

Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java Mon Jan 20 07:57:35 2014
@@ -63,8 +63,12 @@ import org.slf4j.LoggerFactory;
 @org.apache.felix.scr.annotations.Properties(value={
     @Property(name=PROPERTY_NAME),
     @Property(name=EntityhubDereferenceEngine.SITE_ID),
+    @Property(name=DereferenceConstants.FALLBACK_MODE, 
+    	boolValue=DereferenceConstants.DEFAULT_FALLBACK_MODE),
+    @Property(name=DereferenceConstants.URI_PREFIX, cardinality=Integer.MAX_VALUE),
+    @Property(name=DereferenceConstants.URI_PATTERN, cardinality=Integer.MAX_VALUE),
     @Property(name=DereferenceConstants.FILTER_CONTENT_LANGUAGES, 
-    boolValue=DereferenceConstants.DEFAULT_FILTER_CONTENT_LANGUAGES),
+    	boolValue=DereferenceConstants.DEFAULT_FILTER_CONTENT_LANGUAGES),
     @Property(name=DEREFERENCE_ENTITIES_FIELDS,cardinality=Integer.MAX_VALUE,
     	value={"rdfs:comment","geo:lat","geo:long","foaf:depiction","dbp-ont:thumbnail"}),
     @Property(name=DEREFERENCE_ENTITIES_LDPATH, cardinality=Integer.MAX_VALUE),

Modified: stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Jan 20 07:57:35 2014
@@ -53,3 +53,23 @@ enhancer.engine.dereference.filterConten
 enhancer.engine.dereference.filterContentlanguages.description=If enabled only Literals \
 in the language detected for the parsed Content (or with no defined language) are dereferenced
 
+
+enhancer.engines.dereference.fallback.name=Fallback Mode
+enhancer.engines.dereference.fallback.description=If enabled the engine will only \
+try to dereference Entities for which no data were yet added to the Enhancement Results
+
+enhancer.engines.dereference.uriPrefix.name=URI Prefix
+enhancer.engines.dereference.uriPrefix.description=URI prefixes such as \
+'http://rdf.freebase.com/ns/' this engine will try to resolve. NOTE that \
+this engine will use both the configured URI prefixes AND URI patterns to check \
+if it can dereference an Entity. If any of those configurations matches the \
+Entity will be dereferenced.
+
+enhancer.engines.dereference.uriPattern.name=URI Pattern
+enhancer.engines.dereference.uriPattern.description=Regex patterns matched \
+against Entity URIs (e.g. '^http://(\\w+\\.)?dbpedia\\.org/resource/.*' would match \
+dbpedia.org Resources regardless of the language). NOTE that \
+this engine will use both the configured URI prefixes AND URI patterns to check \
+if it can dereference an Entity. If any of those configurations matches the \
+Entity will be dereferenced.
+