You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/09/10 16:29:11 UTC
svn commit: r1624013 - in /stanbol/trunk:
enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/
enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/
...
Author: rwesten
Date: Wed Sep 10 14:29:11 2014
New Revision: 1624013
URL: http://svn.apache.org/r1624013
Log:
merged implementation for STANBOL-1391 from 0.12.1 to trunk
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Wed Sep 10 14:29:11 2014
@@ -87,6 +87,8 @@ public class FstLinkingEngine implements
private static final UriRef ENHANCER_ENTITY_RANKING = new UriRef(NamespaceEnum.fise + "entity-ranking");
+ public static final UriRef FISE_ORIGIN = new UriRef(NamespaceEnum.fise + "origin");
+
private final LiteralFactory literalFactory = LiteralFactory.getInstance();
protected final String name;
@@ -579,6 +581,10 @@ public class FstLinkingEngine implements
Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(match.getScore())));
//add the relation to the fise:TextAnnotation (the tag)
metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
+ //write origin information
+ if(indexConfig.getOrigin() != null){
+ metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
+ }
//TODO: add origin information of the EntiySearcher
// for(Entry<UriRef,Collection<Resource>> originInfo : entitySearcher.getOriginInformation().entrySet()){
// for(Resource value : originInfo.getValue()){
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Wed Sep 10 14:29:11 2014
@@ -33,6 +33,8 @@ import static org.osgi.framework.Constan
import java.io.File;
import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Dictionary;
import java.util.HashMap;
@@ -44,6 +46,10 @@ import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.WildcardFileFilter;
@@ -188,6 +194,16 @@ public class FstLinkingEngineComponent {
public static final String SOLR_CORE = "enhancer.engines.linking.lucenefst.solrcore";
/**
+ * The origin information for all Entities provided by the configured SolrCore and
+ * FST. Origin information is added to all <code>fise:EntityAnnotation</code>s
+ * by using the <code>fise:origin</code> property. Configured values can be either
+ * {@link UriRef URI}s or {@link Literal}s. Configured Strings are checked if
+ * they are valid {@link URI}s and {@link URI#isAbsolute() absolute}. If not,
+ * a {@link Literal} is created instead.
+ */
+ public static final String ORIGIN = "enhancer.engines.linking.lucenefst.origin";
+
+ /**
* The size of the thread pool used to create FST models (default=1). Creating
* such models does need a lot of memory. Expect values up to 10times of the
* build model. So while this task can easily performed concurrently users need
@@ -231,6 +247,11 @@ public class FstLinkingEngineComponent {
private String engineName;
/**
+ * The origin information of Entities.
+ */
+ private Resource origin;
+
+ /**
* used to resolve '{prefix}:{local-name}' used within the engines configuration
*/
@Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
@@ -391,7 +412,29 @@ public class FstLinkingEngineComponent {
skipAltTokensConfig = Boolean.valueOf(value.toString());
} // else no config -> will use the default
- //(4) init the FST configuration
+ //(4) parse Origin information (Resource: used as is; String: UriRef if absolute URI, else plain Literal)
+ value = properties.get(ORIGIN);
+ if(value instanceof Resource){
+ origin = (Resource)value; //use the configured value (was a self-assignment of the field)
+ } else if (value instanceof String){
+ try {
+ URI originUri = new URI((String)value);
+ if(originUri.isAbsolute()){ //absolute URIs (those with a scheme) are written as UriRef
+ origin = new UriRef((String)value);
+ } else { //relative references are treated as plain text
+ origin = new PlainLiteralImpl((String)value);
+ }
+ } catch(URISyntaxException e){ //not a URI at all -> fall back to a plain Literal
+ origin = new PlainLiteralImpl((String)value);
+ }
+ log.info(" - origin: {}", origin);
+ } else if(value != null){ //unsupported type: warn and keep origin unchanged
+ log.warn("Values of the {} property MUST BE of type Resource or String "
+ + "(parsed: {} (type:{}))", new Object[]{ORIGIN,value,value.getClass()});
+ } //else no ORIGIN information provided
+
+
+ //(5) init the FST configuration
//We can create the default configuration only here, as it depends on the
//name of the solrIndex
String defaultConfig = "*;"
@@ -417,7 +460,7 @@ public class FstLinkingEngineComponent {
+ "(found: "+value.getClass().getName()+")!");
}
- //(5) Create the ThreadPool used for the runtime creation of FST models
+ //(6) Create the ThreadPool used for the runtime creation of FST models
value = properties.get(FST_THREAD_POOL_SIZE);
int tpSize;
if(value instanceof Number){
@@ -457,7 +500,7 @@ public class FstLinkingEngineComponent {
}
fstCreatorService = Executors.newFixedThreadPool(tpSize,tfBuilder.build());
- //(6) Parse the EntityCache config
+ //(7) Parse the EntityCache config
int entityCacheSize;
value = properties.get(ENTITY_CACHE_SIZE);
if(value instanceof Number){
@@ -482,14 +525,14 @@ public class FstLinkingEngineComponent {
log.info(" ... EntityCache enabled (size: {})",this.entityCacheSize);
}
- //(7) parse the Entity type field
+ //(8) parse the Entity type field
value = properties.get(IndexConfiguration.SOLR_TYPE_FIELD);
if(value == null || StringUtils.isBlank(value.toString())){
solrTypeField = null;
} else {
solrTypeField = value.toString().trim();
}
- //(8) parse the Entity Ranking field
+ //(9) parse the Entity Ranking field
value = properties.get(IndexConfiguration.SOLR_RANKING_FIELD);
if(value == null){
solrRankingField = null;
@@ -497,7 +540,7 @@ public class FstLinkingEngineComponent {
solrRankingField = value.toString().trim();
}
- //(9) start tracking the SolrCore
+ //(10) start tracking the SolrCore
try {
solrServerTracker = new RegisteredSolrServerTracker(
bundleContext, indexReference, null){
@@ -588,6 +631,7 @@ public class FstLinkingEngineComponent {
//set fields parsed in the activate method
indexConfig.setExecutorService(fstCreatorService);
indexConfig.setRedirectField(null);//TODO add support
+ indexConfig.setOrigin(origin);
//NOTE: the FST cofnig is processed even if the SolrCore has not changed
// because their might be config changes and/or new FST files in the
// FST directory of the SolrCore.
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java Wed Sep 10 14:29:11 2014
@@ -27,6 +27,9 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.WildcardFileFilter;
@@ -105,6 +108,15 @@ public class IndexConfiguration {
private boolean active = false;
private File fstDirectory;
+
+ /**
+ * The origin is added to every <code>fise:EntityAnnotation</code> created for
+ * linked Entities. It is intended to be used for providing a reference to
+ * the dataset of the Entity. Both {@link UriRef URI}s and {@link Literal}s can
+ * be used here.
+ */
+ private Resource origin;
+
/**
* If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
* tokens should cause an {@link UnsupportedTokenException}.
@@ -363,6 +375,22 @@ public class IndexConfiguration {
this.fstDirectory = fstDirectory;
}
+ public void setOrigin(Resource origin) { // stores the value later returned by getOrigin()
+ this.origin = origin;
+ }
+ /**
+ * The Origin of the dataset or <code>null</code> if not defined. The
+ * origin can be used to specify the dataset where the Entities described by
+ * the configured FST originate from. It can be either a URI (e.g.
+ * <code>http://dbpedia.org</code>) or a literal (e.g. "<code>dbpedia</code>").
+ * If present the origin is added to any <code>fise:EntityAnnotation</code>
+ * created by the FstLinkingEngine with the property <code>fise:origin</code>
+ *
+ * @return the origin or <code>null</code> if none is configured
+ */
+ public Resource getOrigin() {
+ return origin;
+ }
/**
* Deactivates this {@link IndexConfiguration}
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java Wed Sep 10 14:29:11 2014
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESSED_LANGUAGES;
import static org.apache.stanbol.enhancer.engines.entitylinking.config.TextProcessingConfig.PROCESS_ONLY_PROPER_NOUNS_STATE;
+import static org.apache.stanbol.enhancer.engines.lucenefstlinking.FstLinkingEngine.FISE_ORIGIN;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CREATOR;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_EXTRACTED_FROM;
@@ -122,6 +123,7 @@ public class FstLinkingEngineTest {
*/
public static final String TEST_YARD_ID = "dbpedia";
public static final String TEST_SOLR_CORE_NAME = "dbpedia";
+ public static final String TEST_ORIGIN = "texst.origin";
public static final String TEST_SOLR_CORE_CONFIGURATION = "dbpedia_26k.solrindex.bz2";
protected static final String TEST_INDEX_REL_PATH = File.separatorChar + "target" + File.separatorChar
+ ManagedSolrServer.DEFAULT_SOLR_DATA_DIR;
@@ -199,6 +201,7 @@ public class FstLinkingEngineTest {
fstConfig.setTypeField("rdf:type");
fstConfig.setRankingField("entityhub:entityRank");
//fstConfig.setEntityCacheManager(new FastLRUCacheManager(2048));
+ fstConfig.setOrigin(new PlainLiteralImpl(TEST_ORIGIN));
//activate the FST config
fstConfig.activate(); //activate this configuration
@@ -384,6 +387,10 @@ public class FstLinkingEngineTest {
if(suggestedEntities.remove(entityUri.getUnicodeString())){
log.info(" ... found");
}
+ //assert origin
+ assertEquals(TEST_ORIGIN, EnhancementEngineHelper.getString(
+ ci.getMetadata(),entityAnnotation, FISE_ORIGIN));
+
// Assert.assertTrue("fise:referenced-entity " + entityUri +
// " not expected (expected: "+expectedEntities+")",
// suggestedEntities.remove(entityUri.getUnicodeString()) ||
Modified: stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java?rev=1624013&r1=1624012&r2=1624013&view=diff
==============================================================================
--- stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java (original)
+++ stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java Wed Sep 10 14:29:11 2014
@@ -222,6 +222,16 @@ public final class Properties {
NamespaceEnum.fise + "confidence-level");
/**
+ * The origin can be used to reference the vocabulary (dataset, thesaurus,
+ * ontology, ...) the Entity {@link #ENHANCER_ENTITY_REFERENCE referenced}
+ * by a <code>{@link TechnicalClasses#ENHANCER_ENTITYANNOTATION fise:EntityAnnotation}</code>
+ * originates from.
+ * @since 0.12.1 (STANBOL-1391)
+ */
+ public static final UriRef ENHANCER_ORIGIN = new UriRef(
+ NamespaceEnum.fise + "origin");
+
+ /**
* Internet Media Type of a content item.
*
* @deprecated dc:FileFormat does not exist