You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/03/28 13:02:23 UTC

svn commit: r1306251 - in /incubator/stanbol/trunk/entityhub: indexing/core/ indexing/core/src/main/java/org/apache/stanbol/entityhub/indexing/core/processor/ indexing/genericrdf/ indexing/genericrdf/src/main/resources/indexing/config/ ldpath/src/main/...

Author: rwesten
Date: Wed Mar 28 11:02:23 2012
New Revision: 1306251

URL: http://svn.apache.org/viewvc?rev=1306251&view=rev
Log:
STANBOL-556: Adds support for LDpath processor to the Entityhub Indexing tools

* added a SingleRepresentationBackend (LDPath Backend for a single Representation)
* added the processor implementation
* added documentation for the new processor to the indexing.properties of the genericrdf indexer


other:

* improved documentation for the EntityTypeFilter
* added a default configuration for the EntityTypeFilter

Added:
    incubator/stanbol/trunk/entityhub/indexing/core/src/main/java/org/apache/stanbol/entityhub/indexing/core/processor/LdpathProcessor.java   (with props)
    incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/entityTypes.properties   (with props)
    incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/backend/SingleRepresentationBackend.java   (with props)
Modified:
    incubator/stanbol/trunk/entityhub/indexing/core/pom.xml
    incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml
    incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/indexing.properties
    incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/EntityhubLDPath.java
    incubator/stanbol/trunk/entityhub/ldpath/src/test/java/org/apache/stanbol/entityhub/ldpath/backend/BackendTest.java

Modified: incubator/stanbol/trunk/entityhub/indexing/core/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/core/pom.xml?rev=1306251&r1=1306250&r2=1306251&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/core/pom.xml (original)
+++ incubator/stanbol/trunk/entityhub/indexing/core/pom.xml Wed Mar 28 11:02:23 2012
@@ -98,6 +98,10 @@
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.entityhub.core</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.entityhub.ldpath</artifactId>
+    </dependency>
     <dependency> <!-- for the Main (command line utility) -->
       <groupId>commons-cli</groupId>
       <artifactId>commons-cli</artifactId>

Added: incubator/stanbol/trunk/entityhub/indexing/core/src/main/java/org/apache/stanbol/entityhub/indexing/core/processor/LdpathProcessor.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/core/src/main/java/org/apache/stanbol/entityhub/indexing/core/processor/LdpathProcessor.java?rev=1306251&view=auto
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/core/src/main/java/org/apache/stanbol/entityhub/indexing/core/processor/LdpathProcessor.java (added)
+++ incubator/stanbol/trunk/entityhub/indexing/core/src/main/java/org/apache/stanbol/entityhub/indexing/core/processor/LdpathProcessor.java Wed Mar 28 11:02:23 2012
@@ -0,0 +1,139 @@
+package org.apache.stanbol.entityhub.indexing.core.processor;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.entityhub.core.model.InMemoryValueFactory;
+import org.apache.stanbol.entityhub.indexing.core.EntityProcessor;
+import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
+import org.apache.stanbol.entityhub.ldpath.EntityhubLDPath;
+import org.apache.stanbol.entityhub.ldpath.backend.SingleRepresentationBackend;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.ValueFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import at.newmedialab.ldpath.exception.LDPathParseException;
+import at.newmedialab.ldpath.model.programs.Program;
+
+public class LdpathProcessor implements EntityProcessor{
+
+    private final Logger log = LoggerFactory.getLogger(LdpathProcessor.class);
+    /**
+     * The reference to the file containing the LDPath statement used for
+     * processing. The path is evaluated relative to the config directory
+     * of the indexing
+     */
+    public static final String PARAMETER_LD_PATH = "ldpath";
+    /**
+     * Whether the results of the LDPath transformation are appended to the
+     * incoming Representation, or whether the incoming Representation is
+     * replaced by the results of the LDPath program (default is append).
+     */
+    public static final String PARAMETER_APPEND = "append";
+    /**
+     * By default, appending of LDPath results to the parsed Representation is
+     * activated
+     */
+    private static final boolean DEFAULT_APPEND_MODE = true;
+
+    private final ValueFactory vf;
+    private final EntityhubLDPath ldPath;
+    private final SingleRepresentationBackend backend;
+    private Program<Object> program;
+    private boolean appendMode;
+    
+    public LdpathProcessor(){
+        vf = InMemoryValueFactory.getInstance();
+        this.backend = new SingleRepresentationBackend(vf);
+        this.ldPath = new EntityhubLDPath(backend);
+    }
+    
+    @Override
+    public boolean needsInitialisation() {
+        return false;
+    }
+
+    @Override
+    public void initialise() {
+    }
+
+    @Override
+    public void close() {
+    }
+
+    @Override
+    public Representation process(Representation source) {
+        if(source == null){
+            return null;
+        }
+        backend.setRepresentation(source);
+        Representation result = ldPath.execute(vf.createReference(source.getId()), program);
+        if(appendMode){
+            Iterator<String> fields = result.getFieldNames();
+            while(fields.hasNext()){
+                String field = fields.next();
+                source.add(field, result.get(field));
+            }
+            return source;
+        } else {
+            return result;
+        }
+    }
+
+    
+    @Override
+    public void setConfiguration(Map<String,Object> config) {
+        IndexingConfig indexingConfig = (IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG);
+        //parse the ldpath
+        final File ldpathFile;
+        Object value = config.get(PARAMETER_LD_PATH);
+        if(value != null && !value.toString().isEmpty()){
+            ldpathFile = indexingConfig.getConfigFile(value.toString());
+            if(ldpathFile == null || !ldpathFile.exists()){
+                throw new IllegalArgumentException("Configured '"
+                        + PARAMETER_LD_PATH +"' file was not found!");
+            }
+            if(!ldpathFile.isFile()){
+                throw new IllegalArgumentException("Configured '"
+                        + PARAMETER_LD_PATH +"' file exists but is not a File!");
+            }
+        } else {
+            throw new IllegalArgumentException("Missing required configuration '"
+                + PARAMETER_LD_PATH +"' - the file containing the LDPath program used by this "
+                + LdpathProcessor.class.getSimpleName()+"!");
+        }
+        Reader in = null;
+        try {
+            in = new InputStreamReader(new FileInputStream(ldpathFile), Charset.forName("UTF-8"));
+            this.program = ldPath.parseProgram(in);
+            log.info("ldpath program: \n{}\n",program.getPathExpression(backend));
+        } catch (IOException e) {
+            throw new IllegalStateException("Unabwle to read LDPath program from configured file '"
+                + ldpathFile +"'!",e);
+        } catch (LDPathParseException e) {
+            throw new IllegalStateException("Unable to parse LDPath program from configured file '"
+                    + ldpathFile +"'!",e);
+        } finally {
+            IOUtils.closeQuietly(in);
+        }
+        value = config.get(PARAMETER_APPEND);
+        if(value instanceof Boolean){
+            this.appendMode = ((Boolean) value).booleanValue();
+        } else if(value != null && !value.toString().isEmpty()){
+            this.appendMode = Boolean.parseBoolean(value.toString());
+        } else {
+            this.appendMode = DEFAULT_APPEND_MODE;
+        }
+    }
+
+    
+}

Propchange: incubator/stanbol/trunk/entityhub/indexing/core/src/main/java/org/apache/stanbol/entityhub/indexing/core/processor/LdpathProcessor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml?rev=1306251&r1=1306250&r2=1306251&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml (original)
+++ incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml Wed Mar 28 11:02:23 2012
@@ -328,11 +328,11 @@
 			<artifactId>commons-fileupload</artifactId>
 			<scope>runtime</scope>
 		</dependency>
-		<dependency>
+		<!-- already included <dependency>
 			<groupId>commons-io</groupId>
 			<artifactId>commons-io</artifactId>
 			<scope>runtime</scope>
-		</dependency>
+		</dependency>  -->
 		<dependency>
 			<groupId>commons-lang</groupId>
 			<artifactId>commons-lang</artifactId>

Added: incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/entityTypes.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/entityTypes.properties?rev=1306251&view=auto
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/entityTypes.properties (added)
+++ incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/entityTypes.properties Wed Mar 28 11:02:23 2012
@@ -0,0 +1,26 @@
+#Configuration for the FieldValueFilter
+
+#This can be used to configure specific rdf:types to be indexed. Entities with
+#other types will be filtered out and not be included in the locally created
+#index
+
+#How to configure
+
+#The key 'field' can be used to configure the field the filters are applied to
+# -  'rdf:type' is used as default for the field
+# -  Only a single field is supported. However one can configure multiple instances
+#    with different configurations in the 'indexing.properties' file.
+# -  It is possible to use a full URI or prefix:localname for all prefixes registered
+#    in 'org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum'
+
+#field=rdf:type
+
+#The key 'values' is used to specify the filter
+# - This is a required configuration.
+# - The value '*' deactivates filtering
+# - Multiple types are supported. Configurations are separated by ';'
+# - It is possible to use full URIs or prefix:localname for all prefixes registered
+#   in 'org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum'
+
+#This deactivates filtering
+values=*
\ No newline at end of file

Propchange: incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/entityTypes.properties
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/indexing.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/indexing.properties?rev=1306251&r1=1306250&r2=1306251&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/indexing.properties (original)
+++ incubator/stanbol/trunk/entityhub/indexing/genericrdf/src/main/resources/indexing/config/indexing.properties Wed Mar 28 11:02:23 2012
@@ -91,17 +91,63 @@ entityScoreProvider=org.apache.stanbol.e
 
 # Entity Processor
 
-# Currently the only available implementation is the FiledMapperProcessor.
-entityProcessor=org.apache.stanbol.entityhub.indexing.core.processor.FiledMapperProcessor
+# Multiple Entity processors can be used for indexing entities. They are separated by ';'
+# and are executed in the order of definition.
+
+# FiledMapperProcessor:
+# 
+# entityProcessor=org.apache.stanbol.entityhub.indexing.core.processor.FiledMapperProcessor
+#
+# This processor ensures that "field mappings" are executed while indexing entities.
 # By default it will use the mappings configured by the "fieldConfiguraton"
 # property. To use other mappings one can use the "mappings" parameter (e.g.
 # mappings:otherMappings.txt
 
-# A default mapping configuration is provided. This file also includes a lot of
-# comments.
+# FieldValueFilter
+# 
+#entityProcessor=org.apache.stanbol.entityhub.indexing.core.processor.FieldValueFilter,config:entityTypes
+# 
+# This allows to define a field and values that are used to filter entities. Only Entities
+# that do have one of the defined values as actual value of the defined field will
+# get indexed. This is typically used to filter entities by rdf:type, but can be used
+# for any URI property. See the default entityTypes.properties file for more information
+
+# LdpathProcessor
+# 
+# This allows to use simple LDpath statements to process entities, such as mapping
+# only properties of entities with a specific type:
+# 
+#      skos:prefLabel = .[rdf:type is <http://example.org/MyType>]/rdfs:label; 
+# 
+# Parameters:
+# * append:[true/false] (default=true) If the result of the LDpath program is
+#     appended to the processed entity or if the processed entity should be
+#     replaced with the results of the LDpath program
+# * ldpath:{file} (required, no default) The {file} containing the LDpath
+#     program used by this processor. {file} is relative to the config directory.
+# 
+# NOTEs:
+# * The LdpathProcessor has only access to the local properties of the currently 
+#   indexed entity. LDPath statements that refer to other information, such as paths 
+#   with a length > 1 or inverse properties, will not work
+# * Processors can be chained by defining multiple Processor instances in the 
+#   configuration and separating them with ';'. This allows to use multiple 
+#   LdpathProcessor instances and/or to chain LdpathProcessor(s) with others 
+#   such as the "FiledMapperProcessor". Processors are executed as defined 
+#   within the configuration of the "entityProcessor" property. 
+# * When using the FiledMapperProcessor on results of the LdpathProcessor make 
+#   sure that the fields defined in the LDpath statements are indexed by the 
+#   FiledMapperProcessor. Otherwise such values will NOT be indexed!
+# org.apache.stanbol.entityhub.indexing.core.processor.LdpathProcessor,ldpath:ldpath-mapping.txt,append:true
 
-# to deactivate EntityProcessing one can use
+# EmptyProcessor
+#
 #entityProcessor=org.apache.stanbol.entityhub.indexing.core.processor.EmptyProcessor
+#
+# This processor can be used to deactivate EntityProcessing
+
+# Default Entity Processor configuration
+entityProcessor=org.apache.stanbol.entityhub.indexing.core.processor.FieldValueFilter,config:entityTypes;org.apache.stanbol.entityhub.indexing.core.processor.FiledMapperProcessor
 
 
 # Index Field Configuration

Modified: incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/EntityhubLDPath.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/EntityhubLDPath.java?rev=1306251&r1=1306250&r2=1306251&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/EntityhubLDPath.java (original)
+++ incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/EntityhubLDPath.java Wed Mar 28 11:02:23 2012
@@ -16,6 +16,8 @@
 */
 package org.apache.stanbol.entityhub.ldpath;
 
+import java.util.Collection;
+
 import org.apache.stanbol.entityhub.core.mapping.ValueConverterFactory.AnyUriConverter;
 import org.apache.stanbol.entityhub.core.mapping.ValueConverterFactory.ReferenceConverter;
 import org.apache.stanbol.entityhub.core.mapping.ValueConverterFactory.TextConverter;
@@ -111,7 +113,10 @@ public class EntityhubLDPath extends LDP
         }
         Representation result = vf.createRepresentation(context.getReference());
         for(FieldMapping<?,Object> mapping : program.getFields()) {
-            result.add(mapping.getFieldName(),mapping.getValues(backend,context));
+            Collection<?> values = mapping.getValues(backend,context);
+            if(values !=null && !values.isEmpty()){
+                result.add(mapping.getFieldName(),values);
+            }
         }
         return result;
         

Added: incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/backend/SingleRepresentationBackend.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/backend/SingleRepresentationBackend.java?rev=1306251&view=auto
==============================================================================
--- incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/backend/SingleRepresentationBackend.java (added)
+++ incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/backend/SingleRepresentationBackend.java Wed Mar 28 11:02:23 2012
@@ -0,0 +1,70 @@
+package org.apache.stanbol.entityhub.ldpath.backend;
+
+import java.util.Collections;
+
+import org.apache.stanbol.entityhub.core.model.InMemoryValueFactory;
+import org.apache.stanbol.entityhub.core.query.FieldQueryImpl;
+import org.apache.stanbol.entityhub.core.query.QueryResultListImpl;
+import org.apache.stanbol.entityhub.servicesapi.EntityhubException;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.apache.stanbol.entityhub.servicesapi.model.ValueFactory;
+import org.apache.stanbol.entityhub.servicesapi.query.FieldQuery;
+import org.apache.stanbol.entityhub.servicesapi.query.QueryResultList;
+
+/**
+ * Allows to execute ldpath on the data of a single Representation. Will not
+ * support paths longer than <code>1</code> but might still be very useful
+ * to select/filter specific values of fields.
+ * @author Rupert Westenthaler
+ *
+ */
+public class SingleRepresentationBackend extends AbstractBackend {
+
+    
+    Representation representation;
+    private final ValueFactory valueFactory;
+    
+    private static final FieldQuery DUMMY_QUERY = new FieldQueryImpl();
+    @SuppressWarnings("unchecked")
+    private static final QueryResultList<String> EMPTY_RESULT = 
+            new QueryResultListImpl<String>(DUMMY_QUERY, Collections.EMPTY_LIST, String.class);
+    
+    public SingleRepresentationBackend() {
+        this(null);
+    }
+    public SingleRepresentationBackend(ValueFactory vf){
+        if(vf == null){
+            this.valueFactory = InMemoryValueFactory.getInstance();
+        } else {
+            this.valueFactory = vf;
+        }
+    }
+    public void setRepresentation(Representation r){
+        if(r != null){
+            this.representation = r;
+        } else {
+            throw new IllegalArgumentException("The parsed Representation MUST NOT be NULL!");
+        }
+    }
+    
+    @Override
+    protected ValueFactory getValueFactory() {
+        return valueFactory;
+    }
+
+    @Override
+    protected Representation getRepresentation(String id) throws EntityhubException {
+        return representation.getId().equals(id) ? representation : null;
+    }
+
+    @Override
+    protected QueryResultList<String> query(FieldQuery query) throws EntityhubException {
+        return EMPTY_RESULT;
+    }
+
+    @Override
+    protected FieldQuery createQuery() {
+        return DUMMY_QUERY;
+    }
+
+}

Propchange: incubator/stanbol/trunk/entityhub/ldpath/src/main/java/org/apache/stanbol/entityhub/ldpath/backend/SingleRepresentationBackend.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/stanbol/trunk/entityhub/ldpath/src/test/java/org/apache/stanbol/entityhub/ldpath/backend/BackendTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/ldpath/src/test/java/org/apache/stanbol/entityhub/ldpath/backend/BackendTest.java?rev=1306251&r1=1306250&r2=1306251&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/ldpath/src/test/java/org/apache/stanbol/entityhub/ldpath/backend/BackendTest.java (original)
+++ incubator/stanbol/trunk/entityhub/ldpath/src/test/java/org/apache/stanbol/entityhub/ldpath/backend/BackendTest.java Wed Mar 28 11:02:23 2012
@@ -18,6 +18,8 @@ package org.apache.stanbol.entityhub.ldp
 
 import static org.apache.stanbol.entityhub.ldpath.LDPathUtils.getReader;
 import static org.junit.Assert.assertNotNull;
+
+import java.io.StringReader;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
@@ -25,6 +27,8 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import org.apache.stanbol.entityhub.ldpath.impl.LDPathTestBase;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+import org.junit.Assert;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -168,4 +172,30 @@ public class BackendTest extends LDPathT
         log.info("Assert LDPath Result for {}:", EXPECTED_HARVARD_ALUMNI);
         assertLDPathResult(result,EXPECTED_HARVARD_ALUMNI);
     }
+    @Test 
+    public void testSingleRepresentationBackend() throws Exception {
+        Representation paris = yard.getRepresentation("http://dbpedia.org/resource/Paris");
+        assertNotNull(paris);
+        SingleRepresentationBackend backend = new SingleRepresentationBackend();
+        backend.setRepresentation(paris);
+        LDPath<Object> ldPath = new LDPath<Object>(backend);
+        StringBuilder sb = new StringBuilder();
+        sb.append("myTest = .[rdf:type is <http://dbpedia.org/ontology/Place>]/rdfs:label;");
+        Program<Object> program = ldPath.parseProgram(new StringReader(sb.toString()));
+        Map<String,Collection<?>> result = program.execute(backend, yard.getValueFactory().createReference(paris.getId()));
+        Assert.assertNotNull(result);
+        Assert.assertTrue(result.containsKey("myTest"));
+        Collection<?> values = result.get("myTest");
+        Assert.assertNotNull(values);
+        Assert.assertFalse(values.isEmpty());
+
+        sb = new StringBuilder();
+        sb.append("myTest = .[rdf:type is <http://dbpedia.org/ontology/Place2>]/rdfs:label;");
+        program = ldPath.parseProgram(new StringReader(sb.toString()));
+        result = program.execute(backend, yard.getValueFactory().createReference(paris.getId()));
+        Assert.assertNotNull(result);
+        values = result.get("myTest");
+        Assert.assertTrue(values == null || values.isEmpty());
+
+    }
 }