You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/09/10 16:29:11 UTC

svn commit: r1624013 - in /stanbol/trunk: enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/ enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/ ...

Author: rwesten
Date: Wed Sep 10 14:29:11 2014
New Revision: 1624013

URL: http://svn.apache.org/r1624013
Log:
merged implementation for STANBOL-1391 from 0.12.1 to trunk

Modified:
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
    stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Wed Sep 10 14:29:11 2014
@@ -87,6 +87,8 @@ public class FstLinkingEngine implements
 
     private static final UriRef ENHANCER_ENTITY_RANKING = new UriRef(NamespaceEnum.fise + "entity-ranking");
 
+    public static final UriRef FISE_ORIGIN = new UriRef(NamespaceEnum.fise + "origin");
+
     private final LiteralFactory literalFactory = LiteralFactory.getInstance();
     
     protected final String name;
@@ -579,6 +581,10 @@ public class FstLinkingEngine implements
                     Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(match.getScore())));
                 //add the relation to the fise:TextAnnotation (the tag)
                 metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
+                //write origin information
+                if(indexConfig.getOrigin() != null){
+                    metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
+                }
                 //TODO: add origin information of the EntitySearcher
 //                for(Entry<UriRef,Collection<Resource>> originInfo : entitySearcher.getOriginInformation().entrySet()){
 //                    for(Resource value : originInfo.getValue()){

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Wed Sep 10 14:29:11 2014
@@ -33,6 +33,8 @@ import static org.osgi.framework.Constan
 
 import java.io.File;
 import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.Arrays;
 import java.util.Dictionary;
 import java.util.HashMap;
@@ -44,6 +46,10 @@ import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.filefilter.WildcardFileFilter;
@@ -188,6 +194,16 @@ public class FstLinkingEngineComponent {
     public static final String SOLR_CORE = "enhancer.engines.linking.lucenefst.solrcore";
     
     /**
+     * The origin information for all Entities provided by the configured SolrCore and
+     * FST. The origin is added to every <code>fise:EntityAnnotation</code>
+     * by using the <code>fise:origin</code> property. Configured values can be either
+     * {@link UriRef URI}s or {@link Literal}s. Configured Strings are checked whether
+     * they are valid and {@link URI#isAbsolute() absolute} {@link URI}s. If not,
+     * a plain {@link Literal} is created instead.
+     */
+    public static final String ORIGIN = "enhancer.engines.linking.lucenefst.origin";
+    
+    /**
      * The size of the thread pool used to create FST models (default=1). Creating
      * such models does need a lot of memory. Expect values up to 10times of the
      * build model. So while this task can easily performed concurrently users need
@@ -231,6 +247,11 @@ public class FstLinkingEngineComponent {
     private String engineName;
     
     /**
+     * The origin information of Entities.
+     */
+    private Resource origin;
+    
+    /**
      * used to resolve '{prefix}:{local-name}' used within the engines configuration
      */
     @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
@@ -391,7 +412,29 @@ public class FstLinkingEngineComponent {
             skipAltTokensConfig = Boolean.valueOf(value.toString());
         } // else no config -> will use the default
         
-        //(4) init the FST configuration
+        //(4) parse Origin information: a Resource is used as is, a String becomes an UriRef (if an absolute URI) or a plain literal
+        value = properties.get(ORIGIN);
+        if(value instanceof Resource){
+            origin = (Resource)value; //use the configured value (casting the field itself would be a no-op)
+        } else if (value instanceof String){
+            try {
+                URI originUri = new URI((String)value);
+                if(originUri.isAbsolute()){
+                    origin = new UriRef((String)value);
+                } else {
+                    origin = new PlainLiteralImpl((String)value);
+                }
+            } catch(URISyntaxException e){
+                origin = new PlainLiteralImpl((String)value);
+            }
+            log.info(" - origin: {}", origin);
+        } else if(value != null){
+            log.warn("Values of the {} property MUST BE of type Resource or String "
+                    + "(parsed: {} (type:{}))", new Object[]{ORIGIN,value,value.getClass()});
+        } //else no ORIGIN information provided
+        
+        
+        //(5) init the FST configuration
         //We can create the default configuration only here, as it depends on the
         //name of the solrIndex
         String defaultConfig = "*;" 
@@ -417,7 +460,7 @@ public class FstLinkingEngineComponent {
                 + "(found: "+value.getClass().getName()+")!");
         }
         
-        //(5) Create the ThreadPool used for the runtime creation of FST models
+        //(6) Create the ThreadPool used for the runtime creation of FST models
         value = properties.get(FST_THREAD_POOL_SIZE);
         int tpSize;
         if(value instanceof Number){
@@ -457,7 +500,7 @@ public class FstLinkingEngineComponent {
         }
         fstCreatorService = Executors.newFixedThreadPool(tpSize,tfBuilder.build());
         
-        //(6) Parse the EntityCache config
+        //(7) Parse the EntityCache config
         int entityCacheSize;
         value = properties.get(ENTITY_CACHE_SIZE);
         if(value instanceof Number){
@@ -482,14 +525,14 @@ public class FstLinkingEngineComponent {
         	log.info(" ... EntityCache enabled (size: {})",this.entityCacheSize);
         }
         
-        //(7) parse the Entity type field
+        //(8) parse the Entity type field
         value = properties.get(IndexConfiguration.SOLR_TYPE_FIELD);
         if(value == null || StringUtils.isBlank(value.toString())){
             solrTypeField = null;
         } else {
             solrTypeField = value.toString().trim();
         }
-        //(8) parse the Entity Ranking field
+        //(9) parse the Entity Ranking field
         value = properties.get(IndexConfiguration.SOLR_RANKING_FIELD);
         if(value == null){
             solrRankingField = null;
@@ -497,7 +540,7 @@ public class FstLinkingEngineComponent {
             solrRankingField = value.toString().trim();
         }
         
-        //(9) start tracking the SolrCore
+        //(10) start tracking the SolrCore
         try {
             solrServerTracker = new RegisteredSolrServerTracker(
                 bundleContext, indexReference, null){
@@ -588,6 +631,7 @@ public class FstLinkingEngineComponent {
                 //set fields parsed in the activate method
                 indexConfig.setExecutorService(fstCreatorService);
                 indexConfig.setRedirectField(null);//TODO add support
+                indexConfig.setOrigin(origin);
                 //NOTE: the FST config is processed even if the SolrCore has not changed
                 //      because there might be config changes and/or new FST files in the
                 //      FST directory of the SolrCore.

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java Wed Sep 10 14:29:11 2014
@@ -27,6 +27,9 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.filefilter.WildcardFileFilter;
@@ -105,6 +108,15 @@ public class IndexConfiguration {
     private boolean active = false;
 
     private File fstDirectory;
+    
+    /**
+     * The origin is added to <code>fise:EntityAnnotation</code>s created for
+     * linked Entities. It is intended to be used for providing a reference to
+     * the dataset of the Entity. Both {@link UriRef URI}s and {@link Literal}s can
+     * be used here.
+     */
+    private Resource origin;
+
     /**
      * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
      * tokens should cause an {@link UnsupportedTokenException}.
@@ -363,6 +375,22 @@ public class IndexConfiguration {
         this.fstDirectory = fstDirectory;
     }
 
+    public void setOrigin(Resource origin) {
+        this.origin = origin;
+    }
+    /**
+     * The Origin of the dataset or <code>null</code> if not defined. The
+     * origin can be used to specify the dataset where the Entities described by
+     * the configured FST originate from. It can be either a URI (e.g.
+     * <code>http://dbpedia.org</code>) or a literal (e.g. "<code>dbpedia</code>").
+     * If present, the origin is added to any <code>fise:EntityAnnotation</code>
+     * created by the FstLinkingEngine with the property <code>fise:origin</code>.
+     * 
+     * @return the origin or <code>null</code> if none is configured
+     */
+    public Resource getOrigin() {
+        return origin;
+    }
     
     /**
      * Deactivates this {@link IndexConfiguration}

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java Wed Sep 10 14:29:11 2014
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
 
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESSED_LANGUAGES;
 import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESS_ONLY_PROPER_NOUNS_STATE;
+import static org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngine.FISE_ORIGIN;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CREATOR;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_EXTRACTED_FROM;
@@ -122,6 +123,7 @@ public class FstLinkingEngineTest {
      */
     public static final String TEST_YARD_ID = "dbpedia";
     public static final String TEST_SOLR_CORE_NAME = "dbpedia";
+    public static final String TEST_ORIGIN = "texst.origin";
     public static final String TEST_SOLR_CORE_CONFIGURATION = "dbpedia_26k.solrindex.bz2";
     protected static final String TEST_INDEX_REL_PATH = File.separatorChar + "target" + File.separatorChar
                                                         + ManagedSolrServer.DEFAULT_SOLR_DATA_DIR;
@@ -199,6 +201,7 @@ public class FstLinkingEngineTest {
         fstConfig.setTypeField("rdf:type");
         fstConfig.setRankingField("entityhub:entityRank");
         //fstConfig.setEntityCacheManager(new FastLRUCacheManager(2048));
+        fstConfig.setOrigin(new PlainLiteralImpl(TEST_ORIGIN));
         //activate the FST config
         fstConfig.activate(); //activate this configuration
         
@@ -384,6 +387,10 @@ public class FstLinkingEngineTest {
             if(suggestedEntities.remove(entityUri.getUnicodeString())){
                 log.info(" ... found");
             }
+            //assert origin
+            assertEquals(TEST_ORIGIN, EnhancementEngineHelper.getString(
+                ci.getMetadata(),entityAnnotation, FISE_ORIGIN));
+            
 //            Assert.assertTrue("fise:referenced-entity " + entityUri +
 //                " not expected (expected: "+expectedEntities+")",
 //                suggestedEntities.remove(entityUri.getUnicodeString()) || 

Modified: stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java (original)
+++ stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java Wed Sep 10 14:29:11 2014
@@ -222,6 +222,16 @@ public final class Properties {
             NamespaceEnum.fise + "confidence-level");
 
     /**
+     * The origin can be used to reference the vocabulary (dataset, thesaurus, 
+     * ontology, ...) the Entity {@link #ENHANCER_ENTITY_REFERENCE referenced}
+     * by a <code>{@link TechnicalClasses#ENHANCER_ENTITYANNOTATION fise:EntityAnnotation}</code>
+     * originates from.
+     * @since 0.12.1 (STANBOL-1391)
+     */
+    public static final UriRef ENHANCER_ORIGIN = new UriRef(
+            NamespaceEnum.fise + "origin");
+    
+    /**
      * Internet Media Type of a content item.
      * 
      * @deprecated dc:FileFormat does not exist