You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/08/24 15:48:54 UTC
svn commit: r1376912 [1/2] - in
/incubator/stanbol/branches/dbpedia-spotlight-engines: ./
bundlelist/src/main/bundles/ engines/ engines/dbpedia-spotlight-annotate/
engines/dbpedia-spotlight-candidates/
engines/dbpedia-spotlight-disambiguate/ engines/db...
Author: rwesten
Date: Fri Aug 24 13:48:52 2012
New Revision: 1376912
URL: http://svn.apache.org/viewvc?rev=1376912&view=rev
Log:
STANBOL-706: Moved all DBpedia Spotlight Engines to a single module. Moved shared functionality to a utility class. Moved shared constants to a common interface; also applied changes to the Disambiguation engine similar to those made for the others
Added:
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/
- copied from r1376397, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/pom.xml
- copied, changed from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/pom.xml
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/Constants.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java
- copied, changed from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/
- copied from r1376397, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java
- copied, changed from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/
- copied from r1376397, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/Annotation.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/CandidateResource.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/SurfaceForm.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/
- copied from r1376397, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java
- copied, changed from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/SpotlightEngineUtils.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/XMLParser.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/config/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpspotlight.config
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java
- copied, changed from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/
- copied from r1376397, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java
- copied, changed from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementTest.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/
- copied from r1376397, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java
- copied, changed from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/resources/README
- copied unchanged from r1376397, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/resources/README
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/resources/spots.xml
- copied unchanged from r1376397, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/resources/spots.xml
Removed:
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/Annotation.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/XMLParser.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/CandidateResource.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/SurfaceForm.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/XMLParser.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/Annotation.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/XMLParser.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/core/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/core/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/core/
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightannotate/
Modified:
incubator/stanbol/branches/dbpedia-spotlight-engines/ (props changed)
incubator/stanbol/branches/dbpedia-spotlight-engines/bundlelist/src/main/bundles/list.xml
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml
incubator/stanbol/branches/dbpedia-spotlight-engines/generic/test/src/main/java/org/apache/stanbol/enhancer/test/helper/EnhancementStructureHelper.java
Propchange: incubator/stanbol/branches/dbpedia-spotlight-engines/
------------------------------------------------------------------------------
svn:mergeinfo = /incubator/stanbol/trunk/enhancer:1376046,1376385
Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/bundlelist/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/bundlelist/src/main/bundles/list.xml?rev=1376912&r1=1376911&r2=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/bundlelist/src/main/bundles/list.xml (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/bundlelist/src/main/bundles/list.xml Fri Aug 24 13:48:52 2012
@@ -201,20 +201,9 @@
<!-- DBpedia Spotlight Engines (STANBOL-706) -->
<bundle>
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlight.spot</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlight</artifactId>
<version>0.10.0-incubating-SNAPSHOT</version>
</bundle>
- <bundle>
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlight.candidates</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
- </bundle>
- <bundle>
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlight.annotate</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
- </bundle>
-
</startLevel>
<!-- Default Configuration for the Stanbol Enhancer -->
Copied: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/pom.xml (from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/pom.xml)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/pom.xml?p2=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/pom.xml&p1=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/pom.xml&r1=1376420&r2=1376912&rev=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/pom.xml (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/pom.xml Fri Aug 24 13:48:52 2012
@@ -22,12 +22,22 @@
</parent>
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlight.annotate</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlight</artifactId>
<version>0.10.0-incubating-SNAPSHOT</version>
<packaging>bundle</packaging>
- <name>Apache Stanbol Enhancer Enhancement Engine : DBPedia Spotlight Annotate</name>
- <description></description>
+ <name>Apache Stanbol Enhancer Enhancement Engine : DBPedia Spotlight</name>
+ <description>
+ This module provides four Enhancement Engines for the
+ DBpedia Spotlight RESTful services. This includes the Annotate Engine
+ - supporting the whole processing workflow as well as a Spotting,
+ Candidate and Disambiguation Engine that can be used by Users that want
+ only to use part of DBpedia Spotlight's functionality within their
+ own Enhancement Engines.
+ Users that do not want to send their Content to the public Spotlight
+ server can also install a local Spotlight server and change the
+ Configuration of the Engines accordingly.
+ </description>
<inceptionYear>2012</inceptionYear>
@@ -54,6 +64,8 @@
</Export-Package>
<Embed-Dependency>
</Embed-Dependency>
+ <!-- configure a dbpedia chain -->
+ <Install-Path>config</Install-Path>
</instructions>
</configuration>
</plugin>
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/Constants.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/Constants.java?rev=1376912&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/Constants.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/Constants.java Fri Aug 24 13:48:52 2012
@@ -0,0 +1,73 @@
+package org.apache.stanbol.enhancer.engines.dbpspotlight;
+
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.felix.scr.annotations.Property;
+
+/**
+ * Defines Properties used for the configuration of the different Engines
+ */
+public interface Constants {
+
+ String PARAM_URL_KEY = "dbpedia.spotlight.url";
+
+ String PARAM_SPOTTER = "dbpedia.spotlight.spotter";
+
+ String PARAM_DISAMBIGUATOR = "dbpedia.spotlight.disambiguator";
+
+ String PARAM_RESTRICTION = "dbpedia.spotlight.types";
+
+ String PARAM_SPARQL = "dbpedia.spotlight.sparql";
+
+ String PARAM_SUPPORT = "dbpedia.spotlight.support";
+
+ String PARAM_CONFIDENCE = "dbpedia.spotlight.confidence";
+
+
+ /**
+ * The namespace used by DBpedia Spotlight specific properties
+ */
+ String SPOTLIGHT_NAME_SPACE = "http://spotlight.dbpedia.org/ns/";
+
+ /*
+ * Definition of some Spotlight specific properties added to
+ * fise:EntityAnnotations created by this Engine
+ */
+ UriRef PROPERTY_CONTEXTUAL_SCORE = new UriRef(
+ SPOTLIGHT_NAME_SPACE + "contextualScore");
+ UriRef PROPERTY_PERCENTAGE_OF_SECOND_RANK = new UriRef(
+ SPOTLIGHT_NAME_SPACE + "percentageOfSecondRank");
+ UriRef PROPERTY_SUPPORT = new UriRef(
+ SPOTLIGHT_NAME_SPACE + "support");
+ UriRef PROPERTY_PRIOR_SCORE = new UriRef(
+ SPOTLIGHT_NAME_SPACE + "priorScore");
+ UriRef PROPERTY_FINAL_SCORE = new UriRef(
+ SPOTLIGHT_NAME_SPACE + "finalScore");
+ UriRef PROPERTY_SIMILARITY_SCORE = new UriRef(
+ SPOTLIGHT_NAME_SPACE + "similarityScore");
+
+ Charset UTF8 = Charset.forName("UTF-8");
+ /**
+ * This contains the only MIME type directly supported by this enhancement
+ * engine.
+ */
+ String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /**
+ * This contains a list of languages supported by DBpedia Spotlight. If the
+ * metadata doesn't contain a value for the language as the value of the
+ * {@link org.apache.stanbol.enhancer.servicesapi.rdf.Properties#DC_LANGUAGE dc:language property} the content can't be processed.
+ */
+ Set<String> SUPPORTED_LANGUAGES = Collections
+ .unmodifiableSet(new HashSet<String>(Arrays.asList("en")));
+
+
+ /** Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} */
+ Set<String> SUPPORTED_MIMTYPES = Collections
+ .singleton(TEXT_PLAIN_MIMETYPE);
+
+}
Copied: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java (from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java?p2=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java&p1=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java&r1=1376420&r2=1376912&rev=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementEngine.java Fri Aug 24 13:48:52 2012
@@ -16,18 +16,15 @@
*/
package org.apache.stanbol.enhancer.engines.dbpspotlight.annotate;
-import static org.apache.stanbol.enhancer.engines.dbpspotlight.annotate.XMLParser.getElementsByTagName;
-import static org.apache.stanbol.enhancer.engines.dbpspotlight.annotate.XMLParser.loadXMLFromInputStream;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_CONFIDENCE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_DISAMBIGUATOR;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_RESTRICTION;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SPARQL;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SPOTTER;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SUPPORT;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_URL_KEY;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.UTF8;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.utils.XMLParser.loadXMLFromInputStream;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
@@ -36,32 +33,15 @@ import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
-import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
-import java.nio.charset.Charset;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
import org.apache.clerezza.rdf.core.Language;
-import org.apache.clerezza.rdf.core.Literal;
-import org.apache.clerezza.rdf.core.LiteralFactory;
-import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.NonLiteral;
-import org.apache.clerezza.rdf.core.Resource;
-import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
-import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.core.serializedform.Serializer;
import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Component;
@@ -71,22 +51,18 @@ import org.apache.felix.scr.annotations.
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
-import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.Annotation;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.utils.SpotlightEngineUtils;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
-import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
@@ -95,39 +71,24 @@ import org.xml.sax.SAXException;
*
* @author Iavor Jelev, Babelmonkeys (GzEvD)
*/
-@Component(metatype = true, immediate = true, label = "%stanbol.DBPSpotlightAnnotateEnhancementEngine.name", description = "%stanbol.DBPSpotlightAnnotateEnhancementEngine.description")
+@Component(metatype = true, immediate = true,
+ label = "%stanbol.DBPSpotlightAnnotateEnhancementEngine.name",
+ description = "%stanbol.DBPSpotlightAnnotateEnhancementEngine.description")
@Service
-@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightannotate") })
+@Properties(value = {
+ @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightannotate"),
+ @Property(name = PARAM_URL_KEY, value = "http://spotlight.dbpedia.org/rest/annotate"),
+ @Property(name = PARAM_SPOTTER),
+ @Property(name = PARAM_DISAMBIGUATOR),
+ @Property(name = PARAM_RESTRICTION),
+ @Property(name = PARAM_SPARQL),
+ @Property(name = PARAM_SUPPORT),
+ @Property(name = PARAM_CONFIDENCE)
+})
public class DBPSpotlightAnnotateEnhancementEngine extends
AbstractEnhancementEngine<IOException, RuntimeException> implements
EnhancementEngine, ServiceProperties {
- private static final Charset UTF8 = Charset.forName("UTF-8");
-
- /**
- * a configurable value of the text segment length to check
- */
- @Property(value = "http://spotlight.dbpedia.org/rest/annotate")
- public static final String SL_URL_KEY = "stanbol.DBPSpotlightAnnotateEnhancementEngine.url";
-
- @Property(value = "NESpotter")
- public static final String SL_SPOTTER = "stanbol.DBPSpotlightAnnotateEnhancementEngine.spotter";
-
- @Property(value = "")
- public static final String SL_DISAMBIGUATOR = "stanbol.DBPSpotlightAnnotateEnhancementEngine.disambiguator";
-
- @Property()
- public static final String SL_RESTRICTION = "stanbol.DBPSpotlightAnnotateEnhancementEngine.types";
-
- @Property()
- public static final String SL_SPARQL = "stanbol.DBPSpotlightAnnotateEnhancementEngine.sparql";
-
- @Property()
- public static final String SL_SUPPORT = "stanbol.DBPSpotlightAnnotateEnhancementEngine.support";
-
- @Property()
- public static final String SL_CONFIDENCE = "stanbol.DBPSpotlightAnnotateEnhancementEngine.confidence";
-
/**
* Ensures this engine is deactivated in {@link OfflineMode}
*/
@@ -140,22 +101,6 @@ public class DBPSpotlightAnnotateEnhance
*/
public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 27;
- /**
- * This contains the only MIME type directly supported by this enhancement
- * engine.
- */
- private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
- /** Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} */
- private static final Set<String> SUPPORTED_MIMTYPES = Collections
- .singleton(TEXT_PLAIN_MIMETYPE);
- /**
- * This contains a list of languages supported by DBpedia Spotlight. If the
- * metadata doesn't contain a value for the language as the value of the
- * {@link Property.DC_LANG property} the content can't be processed.
- */
- protected static final Set<String> SUPPORTED_LANGUAGES = Collections
- .unmodifiableSet(new HashSet<String>(Arrays.asList("en")));
-
/** holds the logger. */
private static final Logger log = LoggerFactory
.getLogger(DBPSpotlightAnnotateEnhancementEngine.class);
@@ -203,34 +148,23 @@ public class DBPSpotlightAnnotateEnhance
super.activate(ce);
Dictionary<String, Object> properties = ce.getProperties();
- Object value = properties.get(SL_URL_KEY);
- if(value == null || value.toString().isEmpty()){
- throw new ConfigurationException(SL_URL_KEY, "The URL with the DBpedia "
- + "Spotlight Annotate RESTful Service MUST NOT be NULL nor empty!");
- } else {
- String url = (String) properties.get(SL_URL_KEY);
- try {
- this.spotlightUrl = new URL(url);
- } catch (MalformedURLException e) {
- throw new ConfigurationException(SL_URL_KEY, "The parsed URL for the "
- + "DBpedia Spotlight Annotate RESTful Service is illegal formatted!",
- e);
- }
- }
- spotlightSpotter = properties.get(SL_SPOTTER) == null ? null
- : (String) properties.get(SL_SPOTTER);
- spotlightDisambiguator = properties.get(SL_DISAMBIGUATOR) == null ? null
- : (String) properties.get(SL_DISAMBIGUATOR);
- spotlightTypesRestriction = properties.get(SL_RESTRICTION) == null ? null
- : (String) properties.get(SL_RESTRICTION);
- spotlightSparql = properties.get(SL_SPARQL) == null ? null
- : (String) properties.get(SL_SPARQL);
- spotlightSupport = properties.get(SL_SUPPORT) == null ? null
- : (String) properties.get(SL_SUPPORT);
- spotlightConfidence = properties.get(SL_CONFIDENCE) == null ? null
- : (String) properties.get(SL_CONFIDENCE);
+ spotlightUrl = SpotlightEngineUtils.parseSpotlightServiceURL(properties);
+ spotlightSpotter = properties.get(PARAM_SPOTTER) == null ? null
+ : (String) properties.get(PARAM_SPOTTER);
+ spotlightDisambiguator = properties.get(PARAM_DISAMBIGUATOR) == null ? null
+ : (String) properties.get(PARAM_DISAMBIGUATOR);
+ spotlightTypesRestriction = properties.get(PARAM_RESTRICTION) == null ? null
+ : (String) properties.get(PARAM_RESTRICTION);
+ spotlightSparql = properties.get(PARAM_SPARQL) == null ? null
+ : (String) properties.get(PARAM_SPARQL);
+ spotlightSupport = properties.get(PARAM_SUPPORT) == null ? null
+ : (String) properties.get(PARAM_SUPPORT);
+ spotlightConfidence = properties.get(PARAM_CONFIDENCE) == null ? null
+ : (String) properties.get(PARAM_CONFIDENCE);
}
+
+
/**
* Check if the content can be enhanced
*
@@ -238,18 +172,8 @@ public class DBPSpotlightAnnotateEnhance
* the {@link ContentItem}
*/
public int canEnhance(ContentItem ci) throws EngineException {
- if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) {
- String language = EnhancementEngineHelper.getLanguage(ci);
- if (!SUPPORTED_LANGUAGES.contains(language)) {
- log.info("DBpedia Spotlight can not process ContentItem {} "
- + "because language {} is not supported (supported: {})",
- new Object[] { ci.getUri(), language, SUPPORTED_LANGUAGES });
- return CANNOT_ENHANCE;
- }
- return ENHANCE_ASYNC;
- } else {
- return CANNOT_ENHANCE;
- }
+ return SpotlightEngineUtils.canProcess(ci) ?
+ ENHANCE_ASYNC : CANNOT_ENHANCE;
}
/**
@@ -260,36 +184,8 @@ public class DBPSpotlightAnnotateEnhance
* the {@link ContentItem}
*/
public void computeEnhancements(ContentItem ci) throws EngineException {
- Language language;
- String lang = EnhancementEngineHelper.getLanguage(ci);
- if(!SUPPORTED_LANGUAGES.contains(lang)){
- throw new IllegalStateException("Langage '"+lang
- + "' as annotated for ContentItem "
- + ci.getUri() + " is not supported by this Engine: "
- + "This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- } else {
- language = lang == null || lang.isEmpty() ? null : new Language(lang);
- }
- Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci,
- SUPPORTED_MIMTYPES);
- if (contentPart == null) {
- throw new IllegalStateException(
- "No ContentPart with Mimetype '"
- + TEXT_PLAIN_MIMETYPE
- + "' found for ContentItem "
- + ci.getUri()
- + ": This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- }
- String text;
- try {
- text = ContentItemHelper.getText(contentPart.getValue());
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
+ Language language = SpotlightEngineUtils.getContentLanguage(ci);
+ String text = SpotlightEngineUtils.getPlainContent(ci);
Collection<Annotation> dbpslGraph = doPostRequest(text,ci.getUri());
if (dbpslGraph != null) {
@@ -316,6 +212,8 @@ public class DBPSpotlightAnnotateEnhance
}
}
+
+
/**
* This generates enhancement structures for the entities from DBPedia
* Spotlight and adds them to the content item's metadata. For each entity a
@@ -329,62 +227,22 @@ public class DBPSpotlightAnnotateEnhance
*/
protected void createEnhancements(Collection<Annotation> occs,
ContentItem ci, String text, Language language) {
- LiteralFactory literalFactory = LiteralFactory.getInstance();
-
- HashMap<Resource, UriRef> entityAnnotationMap = new HashMap<Resource, UriRef>();
-
+ //we need to create multiple EntityAnnotations even for the same
+ //suggested Entity, as the scores will be different
+ //HashMap<Resource, UriRef> entityAnnotationMap = new HashMap<Resource, UriRef>();
for (Annotation occ : occs) {
- UriRef textAnnotation = EnhancementEngineHelper
- .createTextEnhancement(ci, this);
- MGraph model = ci.getMetadata();
- model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
- new PlainLiteralImpl(occ.surfaceForm, language)));
- model.add(new TripleImpl(textAnnotation, ENHANCER_START,
- literalFactory.createTypedLiteral(occ.offset)));
- model.add(new TripleImpl(textAnnotation, ENHANCER_END,
- literalFactory.createTypedLiteral(occ.offset
- + occ.surfaceForm.length())));
- model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
- new PlainLiteralImpl(
- getSelectionContext(text, occ.surfaceForm, occ.offset),
- language)));
- if (entityAnnotationMap.containsKey(occ.uri)) {
- model.add(new TripleImpl(entityAnnotationMap.get(occ.uri),
- DC_RELATION, textAnnotation));
- } else {
- UriRef entityAnnotation = EnhancementEngineHelper
- .createEntityEnhancement(ci, this);
- log.info(" annotation: {} {}",occ.uri,occ.surfaceForm);
- entityAnnotationMap.put(occ.uri, entityAnnotation);
- Literal label = new PlainLiteralImpl(occ.surfaceForm,
- new Language("en"));
- model.add(new TripleImpl(entityAnnotation, DC_RELATION,
- textAnnotation));
- model.add(new TripleImpl(entityAnnotation,
- ENHANCER_ENTITY_LABEL, label));
- model.add(new TripleImpl(entityAnnotation,
- ENHANCER_ENTITY_REFERENCE, occ.uri));
- //set the fise:entity-type
- for(String type : occ.getTypeNames()){
- UriRef annotationType = new UriRef(type);
- log.info(" > {}",annotationType);
- model.add(new TripleImpl(entityAnnotation,
- ENHANCER_ENTITY_TYPE, annotationType));
- }
- //set the dc:type of the fise:TextAnnotation if not yet done
- List<String> dbpTypes = occ.getDbpediaTypeNames();
- if(!dbpTypes.isEmpty() && !model.filter(textAnnotation, DC_TYPE, null).hasNext()){
- //use the last of the dbpedia ontology type as they
- //are sorted from the most specific to the most
- //common one - the dc:type should be a common one
- UriRef dcType = new UriRef(dbpTypes.get(dbpTypes.size()-1));
- log.info(" dcType={}",dcType);
- model.add(new TripleImpl(textAnnotation, DC_TYPE,
- dcType));
- }
+ UriRef textAnnotation = SpotlightEngineUtils.createTextEnhancement(
+ occ.surfaceForm, this, ci, text, language);
+
+// if (entityAnnotationMap.containsKey(occ.uri)) {
+// model.add(new TripleImpl(entityAnnotationMap.get(occ.uri),
+// DC_RELATION, textAnnotation));
+// } else {
+ SpotlightEngineUtils.createEntityAnnotation(occ, this, ci, textAnnotation, language);
+// entityAnnotationMap.put(occ.uri, entityAnnotation);
}
}
- }
+
/**
* Sends a POST request to the DBpediaSpotlight endpoint.
@@ -484,80 +342,13 @@ public class DBPSpotlightAnnotateEnhance
} finally {
IOUtils.closeQuietly(is);
}
- NodeList nlist = getElementsByTagName(xmlDoc, "Resource");
- return getAnnotations(nlist);
+ return Annotation.parseAnnotations(xmlDoc);
}
- /**
- * This method creates the Collection of Annotations, which the method
- * <code>createEnhancement</code> adds to the meta data of the content item.
- *
- * @param nList
- * NodeList of all Resources contained in the XML response from
- * DBpedia Spotlight
- * @return a Collection<DBPSLAnnotation> with all annotations
- */
- private Collection<Annotation> getAnnotations(NodeList nList) {
- Collection<Annotation> dbpslAnnos = new HashSet<Annotation>();
- for (int temp = 0; temp < nList.getLength(); temp++) {
- Annotation dbpslann = new Annotation();
- Element node = (Element) nList.item(temp);
- dbpslann.uri = new UriRef(node.getAttribute("URI"));
- dbpslann.support = (new Integer(node.getAttribute("support")))
- .intValue();
- dbpslann.types = node.getAttribute("types");
- dbpslann.surfaceForm = node.getAttribute("surfaceForm");
- dbpslann.offset = (new Integer(node.getAttribute("offset")))
- .intValue();
- dbpslann.similarityScore = (new Double(
- node.getAttribute("similarityScore"))).doubleValue();
- dbpslann.percentageOfSecondRank = (new Double(
- node.getAttribute("percentageOfSecondRank"))).doubleValue();
-
- dbpslAnnos.add(dbpslann);
- }
-
- return dbpslAnnos;
- }
public Map<String, Object> getServiceProperties() {
return Collections.unmodifiableMap(Collections.singletonMap(
ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
}
-
- private static final int DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE = 50;
- /**
- * Extracts the selection context based on the content, selection and
- * the start char offset of the selection
- * @param content the content
- * @param selection the selected text
- * @param selectionStartPos the start char position of the selection
- * @return the context
- */
- protected static String getSelectionContext(String content, String selection,int selectionStartPos){
- //extract the selection context
- int beginPos;
- if(selectionStartPos <= DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE){
- beginPos = 0;
- } else {
- int start = selectionStartPos-DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
- beginPos = content.indexOf(' ',start);
- if(beginPos < 0 || beginPos >= selectionStartPos){ //no words
- beginPos = start; //begin within a word
- }
- }
- int endPos;
- if(selectionStartPos+selection.length()+DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE >= content.length()){
- endPos = content.length();
- } else {
- int start = selectionStartPos+selection.length()+DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
- endPos = content.lastIndexOf(' ', start);
- if(endPos <= selectionStartPos+selection.length()){
- endPos = start; //end within a word;
- }
- }
- return content.substring(beginPos, endPos);
- }
-
}
Copied: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java (from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java?p2=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java&p1=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java&r1=1376420&r2=1376912&rev=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementEngine.java Fri Aug 24 13:48:52 2012
@@ -16,51 +16,36 @@
*/
package org.apache.stanbol.enhancer.engines.dbpspotlight.candidates;
-import static org.apache.stanbol.enhancer.engines.dbpspotlight.candidates.XMLParser.getElementsByTagName;
-import static org.apache.stanbol.enhancer.engines.dbpspotlight.candidates.XMLParser.loadXMLFromInputStream;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_CONFIDENCE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_DISAMBIGUATOR;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_RESTRICTION;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SPARQL;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SPOTTER;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SUPPORT;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_URL_KEY;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.UTF8;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.utils.XMLParser.loadXMLFromInputStream;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
-import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
-import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
-import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
-import java.nio.charset.Charset;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
import org.apache.clerezza.rdf.core.Language;
-import org.apache.clerezza.rdf.core.Literal;
-import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.NonLiteral;
-import org.apache.clerezza.rdf.core.Resource;
-import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.core.serializedform.Serializer;
import org.apache.commons.io.IOUtils;
@@ -71,23 +56,19 @@ import org.apache.felix.scr.annotations.
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
-import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.CandidateResource;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.SurfaceForm;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.utils.SpotlightEngineUtils;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
-import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
@@ -98,33 +79,20 @@ import org.xml.sax.SAXException;
*/
@Component(metatype = true, immediate = true, label = "%stanbol.DBPSpotlightCandidatesEnhancementEngine.name", description = "%stanbol.DBPSpotlightCandidatesEnhancementEngine.description")
@Service
-@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightcandidates") })
+@Properties(value = {
+ @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightcandidates"),
+ @Property(name = PARAM_URL_KEY, value = "http://spotlight.dbpedia.org/rest/candidates"),
+ @Property(name = PARAM_SPOTTER),
+ @Property(name = PARAM_DISAMBIGUATOR),
+ @Property(name = PARAM_RESTRICTION),
+ @Property(name = PARAM_SPARQL),
+ @Property(name = PARAM_SUPPORT),
+ @Property(name = PARAM_CONFIDENCE)
+})
public class DBPSpotlightCandidatesEnhancementEngine extends
AbstractEnhancementEngine<IOException, RuntimeException> implements
EnhancementEngine, ServiceProperties {
- /** a configurable value of the text segment length to check */
- @Property(value = "http://spotlight.dbpedia.org/rest/candidates")
- public static final String SL_URL_KEY = "stanbol.DBPSpotlightCandidatesEnhancementEngine.url";
-
- @Property(value = "LingPipeSpotter")
- public static final String SL_SPOTTER = "stanbol.DBPSpotlightCandidatesEnhancementEngine.spotter";
-
- @Property(value = "")
- public static final String SL_DISAMBIGUATOR = "stanbol.DBPSpotlightCandidatesEnhancementEngine.disambiguator";
-
- @Property()
- public static final String SL_RESTRICTION = "stanbol.DBPSpotlightCandidatesEnhancementEngine.types";
-
- @Property()
- public static final String SL_SPARQL = "stanbol.DBPSpotlightCandidatesEnhancementEngine.sparql";
-
- @Property()
- public static final String SL_SUPPORT = "stanbol.DBPSpotlightCandidatesEnhancementEngine.support";
-
- @Property()
- public static final String SL_CONFIDENCE = "stanbol.DBPSpotlightCandidatesEnhancementEngine.confidence";
-
/**
* Ensures this engine is deactivated in {@link OfflineMode}
*/
@@ -138,43 +106,6 @@ public class DBPSpotlightCandidatesEnhan
*/
public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 35;
- private static final Charset UTF8 = Charset.forName("UTF-8");
- /**
- * This contains the only MIME type directly supported by this enhancement
- * engine.
- */
- private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
- /**
- * This contains a list of languages supported by DBpedia Spotlight. If the
- * metadata doesn't contain a value for the language as the value of the
- * {@link Property.DC_LANG property} the content can't be processed.
- */
- protected static final Set<String> SUPPORTED_LANGUAGES = Collections
- .unmodifiableSet(new HashSet<String>(Arrays.asList("en")));
- /**
- * This contains the only MIME type directly supported by this enhancement
- * engine.
- */
- private static final String SPOTLIGHT_NAME_SPACE = "http://spotlight.dbpedia.org/ns/";
-
- /*
- * Definition of some Spotlight specific properties added to
- * fise:EntityAnnotations created by this Engine
- */
- public static final UriRef SPOTLIGHT_CONTEXTUAL_SCORE = new UriRef(
- SPOTLIGHT_NAME_SPACE + "contextualScore");
- public static final UriRef SPOTLIGHT_PERCENTAGE_OF_SECOND_RANK = new UriRef(
- SPOTLIGHT_NAME_SPACE + "percentageOfSecondRank");
- public static final UriRef SPOTLIGHT_SUPPORT = new UriRef(
- SPOTLIGHT_NAME_SPACE + "support");
- public static final UriRef SPOTLIGHT_PRIOR_SCORE = new UriRef(
- SPOTLIGHT_NAME_SPACE + "priorScore");
- public static final UriRef SPOTLIGHT_FINAL_SCORE = new UriRef(
- SPOTLIGHT_NAME_SPACE + "finalScore");
-
- /** Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} */
- private static final Set<String> SUPPORTED_MIMTYPES = Collections
- .singleton(TEXT_PLAIN_MIMETYPE);
/** This contains the logger. */
private static final Logger log = LoggerFactory
@@ -224,32 +155,19 @@ public class DBPSpotlightCandidatesEnhan
// TODO initialize Extractor
Dictionary<String, Object> properties = ce.getProperties();
//parse the URL of the RESTful service
- Object value = properties.get(SL_URL_KEY);
- if(value == null || value.toString().isEmpty()){
- throw new ConfigurationException(SL_URL_KEY, "The URL with the DBpedia "
- + "Spotlight Spot RESTful Service MUST NOT be NULL nor empty!");
- } else {
- String url = (String) properties.get(SL_URL_KEY);
- try {
- this.spotlightUrl = new URL(url);
- } catch (MalformedURLException e) {
- throw new ConfigurationException(SL_URL_KEY, "The parsed URL for the "
- + "DBpedia Spotlight Spot RESTful Service is illegal formatted!",
- e);
- }
- }
- spotlightSpotter = properties.get(SL_SPOTTER) == null ? null
- : (String) properties.get(SL_SPOTTER);
- spotlightDisambiguator = properties.get(SL_DISAMBIGUATOR) == null ? null
- : (String) properties.get(SL_DISAMBIGUATOR);
- spotlightTypesRestriction = properties.get(SL_RESTRICTION) == null ? null
- : (String) properties.get(SL_RESTRICTION);
- spotlightSparql = properties.get(SL_SPARQL) == null ? null
- : (String) properties.get(SL_SPARQL);
- spotlightSupport = properties.get(SL_SUPPORT) == null ? null
- : (String) properties.get(SL_SUPPORT);
- spotlightConfidence = properties.get(SL_CONFIDENCE) == null ? null
- : (String) properties.get(SL_CONFIDENCE);
+ spotlightUrl = SpotlightEngineUtils.parseSpotlightServiceURL(properties);
+ spotlightSpotter = properties.get(PARAM_SPOTTER) == null ? null
+ : (String) properties.get(PARAM_SPOTTER);
+ spotlightDisambiguator = properties.get(PARAM_DISAMBIGUATOR) == null ? null
+ : (String) properties.get(PARAM_DISAMBIGUATOR);
+ spotlightTypesRestriction = properties.get(PARAM_RESTRICTION) == null ? null
+ : (String) properties.get(PARAM_RESTRICTION);
+ spotlightSparql = properties.get(PARAM_SPARQL) == null ? null
+ : (String) properties.get(PARAM_SPARQL);
+ spotlightSupport = properties.get(PARAM_SUPPORT) == null ? null
+ : (String) properties.get(PARAM_SUPPORT);
+ spotlightConfidence = properties.get(PARAM_CONFIDENCE) == null ? null
+ : (String) properties.get(PARAM_CONFIDENCE);
}
/**
@@ -259,18 +177,8 @@ public class DBPSpotlightCandidatesEnhan
* the {@link ContentItem}
*/
public int canEnhance(ContentItem ci) throws EngineException {
- if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) {
- String language = EnhancementEngineHelper.getLanguage(ci);
- if (!SUPPORTED_LANGUAGES.contains(language)) {
- log.info("DBpedia Spotlight can not process ContentItem {} "
- + "because language {} is not supported (supported: {})",
- new Object[] { ci.getUri(), language, SUPPORTED_LANGUAGES });
- return CANNOT_ENHANCE;
- }
- return ENHANCE_ASYNC;
- } else {
- return CANNOT_ENHANCE;
- }
+ return SpotlightEngineUtils.canProcess(ci) ?
+ ENHANCE_ASYNC : CANNOT_ENHANCE;
}
/**
@@ -281,36 +189,8 @@ public class DBPSpotlightCandidatesEnhan
* the {@link ContentItem}
*/
public void computeEnhancements(ContentItem ci) throws EngineException {
- Language language;
- String lang = EnhancementEngineHelper.getLanguage(ci);
- if(!SUPPORTED_LANGUAGES.contains(lang)){
- throw new IllegalStateException("Langage '"+lang
- + "' as annotated for ContentItem "
- + ci.getUri() + " is not supported by this Engine: "
- + "This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- } else {
- language = lang == null || lang.isEmpty() ? null : new Language(lang);
- }
- Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci,
- SUPPORTED_MIMTYPES);
- if (contentPart == null) {
- throw new IllegalStateException(
- "No ContentPart with Mimetype '"
- + TEXT_PLAIN_MIMETYPE
- + "' found for ContentItem "
- + ci.getUri()
- + ": This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- }
- String text;
- try {
- text = ContentItemHelper.getText(contentPart.getValue());
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
+ Language language = SpotlightEngineUtils.getContentLanguage(ci);
+ String text = SpotlightEngineUtils.getPlainContent(ci);
Collection<SurfaceForm> dbpslGraph = doPostRequest(text,ci.getUri());
if (dbpslGraph != null) {
@@ -349,55 +229,20 @@ public class DBPSpotlightCandidatesEnhan
*/
protected void createEnhancements(Collection<SurfaceForm> occs,
ContentItem ci, String text, Language language) {
- LiteralFactory literalFactory = LiteralFactory.getInstance();
// TODO create TextEnhancement (form, start, end, type?)
HashMap<String, UriRef> entityAnnotationMap = new HashMap<String, UriRef>();
MGraph model = ci.getMetadata();
for (SurfaceForm occ : occs) {
- UriRef textAnnotation = EnhancementEngineHelper
- .createTextEnhancement(ci, this);
- // model.add(new TripleImpl(textAnnotation, DC_TYPE, new UriRef(
- // occ.types )));
- // for autotagger use the name instead of the matched term (that
- // might be a pronoun!)
- model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
- new PlainLiteralImpl(occ.name, language)));
- model.add(new TripleImpl(textAnnotation, ENHANCER_START,
- literalFactory.createTypedLiteral(occ.offset)));
- model.add(new TripleImpl(textAnnotation, ENHANCER_END,
- literalFactory.createTypedLiteral(occ.offset
- + occ.name.length())));
- model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
- new PlainLiteralImpl(
- getSelectionContext(text, occ.name, occ.offset),
- language)));
-
+ UriRef textAnnotation = SpotlightEngineUtils.createTextEnhancement(
+ occ, this, ci, text, language);
Iterator<CandidateResource> resources = occ.resources.iterator();
while (resources.hasNext()) {
CandidateResource resource = resources.next();
- UriRef entityAnnotation = EnhancementEngineHelper
- .createEntityEnhancement(ci, this);
+ UriRef entityAnnotation = SpotlightEngineUtils.createEntityAnnotation(
+ resource, this, ci, textAnnotation);
entityAnnotationMap.put(resource.uri, entityAnnotation);
- Literal label = new PlainLiteralImpl(resource.label,
- new Language("en"));
- model.add(new TripleImpl(entityAnnotation, DC_RELATION,
- textAnnotation));
- model.add(new TripleImpl(entityAnnotation,
- ENHANCER_ENTITY_LABEL, label));
- model.add(new TripleImpl(entityAnnotation,
- ENHANCER_ENTITY_REFERENCE, new UriRef(resource.uri)));
- model.add(new TripleImpl(entityAnnotation, SPOTLIGHT_CONTEXTUAL_SCORE,
- literalFactory.createTypedLiteral(resource.contextualScore)));
- model.add(new TripleImpl(entityAnnotation,SPOTLIGHT_PERCENTAGE_OF_SECOND_RANK,
- literalFactory.createTypedLiteral(resource.percentageOfSecondRank)));
- model.add(new TripleImpl(entityAnnotation, SPOTLIGHT_SUPPORT, literalFactory
- .createTypedLiteral(resource.support)));
- model.add(new TripleImpl(entityAnnotation, SPOTLIGHT_PRIOR_SCORE, literalFactory
- .createTypedLiteral(resource.priorScore)));
- model.add(new TripleImpl(entityAnnotation, SPOTLIGHT_FINAL_SCORE, literalFactory
- .createTypedLiteral(resource.finalScore)));
}
if (entityAnnotationMap.containsKey(occ.name)) {
model.add(new TripleImpl(entityAnnotationMap.get(occ.name),
@@ -505,62 +350,7 @@ public class DBPSpotlightCandidatesEnhan
} finally {
IOUtils.closeQuietly(is);
}
- NodeList nlist = getElementsByTagName(xmlDoc,"surfaceForm");
- Collection<SurfaceForm> annos = this.getAnnotations(nlist);
- return annos;
- }
-
- /**
- * This method creates the Collection of surface forms, which the method
- * <code>createEnhancement</code> adds to the meta data of the content item
- * as TextAnnotations.
- *
- * @param nList
- * NodeList of all Resources contained in the XML response from
- * DBpedia Spotlight
- * @return a Collection<DBPSLSurfaceForm> with all annotations
- */
- private Collection<SurfaceForm> getAnnotations(NodeList nList) {
- Collection<SurfaceForm> dbpslAnnos = new HashSet<SurfaceForm>();
-
- for (int temp = 0; temp < nList.getLength(); temp++) {
- SurfaceForm dbpslann = new SurfaceForm();
- Element node = (Element) nList.item(temp);
- dbpslann.name = node.getAttribute("name");
- dbpslann.offset = (new Integer(node.getAttribute("offset")))
- .intValue();
- // dbpslann.type = node.getAttribute( "type" );
-
- NodeList resources = node.getChildNodes();
-
- for (int count = 0; count < resources.getLength(); count++) {
- Node n = resources.item(count);
- if (n instanceof Element) {
- Element r = (Element) n;
- CandidateResource resource = new CandidateResource();
- resource.label = r.getAttribute("label");
- resource.uri = r.getAttribute("uri");
- resource.contextualScore = (new Double(
- r.getAttribute("contextualScore"))).doubleValue();
- resource.percentageOfSecondRank = (new Double(
- r.getAttribute("percentageOfSecondRank")))
- .doubleValue();
- resource.support = (new Double(r.getAttribute("support")))
- .doubleValue();
- resource.priorScore = (new Double(
- r.getAttribute("priorScore"))).doubleValue();
- resource.finalScore = (new Double(
- r.getAttribute("finalScore"))).doubleValue();
- dbpslann.resources.add(resource);
- }
-
- // Element r = (Element) resources.item(count);
- }
-
- dbpslAnnos.add(dbpslann);
- }
-
- return dbpslAnnos;
+ return CandidateResource.parseCandidates(xmlDoc);
}
public Map<String, Object> getServiceProperties() {
@@ -568,38 +358,5 @@ public class DBPSpotlightCandidatesEnhan
ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
}
- private static final int DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE = 50;
- /**
- * Extracts the selection context based on the content, selection and
- * the start char offset of the selection
- * @param content the content
- * @param selection the selected text
- * @param selectionStartPos the start char position of the selection
- * @return the context
- */
- protected static String getSelectionContext(String content, String selection,int selectionStartPos){
- //extract the selection context
- int beginPos;
- if(selectionStartPos <= DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE){
- beginPos = 0;
- } else {
- int start = selectionStartPos-DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
- beginPos = content.indexOf(' ',start);
- if(beginPos < 0 || beginPos >= selectionStartPos){ //no words
- beginPos = start; //begin within a word
- }
- }
- int endPos;
- if(selectionStartPos+selection.length()+DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE >= content.length()){
- endPos = content.length();
- } else {
- int start = selectionStartPos+selection.length()+DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
- endPos = content.lastIndexOf(' ', start);
- if(endPos <= selectionStartPos+selection.length()){
- endPos = start; //end within a word;
- }
- }
- return content.substring(beginPos, endPos);
- }
}
Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java?rev=1376912&r1=1376397&r2=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java Fri Aug 24 13:48:52 2012
@@ -16,7 +16,14 @@
*/
package org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_CONFIDENCE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_DISAMBIGUATOR;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_RESTRICTION;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SPARQL;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SUPPORT;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_URL_KEY;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.UTF8;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.utils.XMLParser.loadXMLFromInputStream;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
@@ -25,12 +32,11 @@ import static org.apache.stanbol.enhance
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
-import java.io.BufferedReader;
+import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
-import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
@@ -39,44 +45,42 @@ import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.core.serializedform.Serializer;
+import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
-import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
+import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.Annotation;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.utils.SpotlightEngineUtils;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
-import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
/**
* {@link DBPSpotlightDisambiguateEnhancementEngine} provides functionality to
@@ -84,31 +88,29 @@ import org.w3c.dom.NodeList;
*
* @author Iavor Jelev, Babelmonkeys (GzEvD)
*/
-@Component(metatype = true, immediate = true, label = "%stanbol.DBPSpotlightDisambiguateEnhancementEngine.name", description = "%stanbol.DBPSpotlightDisambiguateEnhancementEngine.description")
+@Component(metatype = true, immediate = true,
+ label = "%stanbol.DBPSpotlightDisambiguateEnhancementEngine.name",
+ description = "%stanbol.DBPSpotlightDisambiguateEnhancementEngine.description")
@Service
-@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightdisambiguate") })
+@Properties(value = {
+ @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightdisambiguate"),
+ @Property(name = PARAM_URL_KEY, value = "http://spotlight.dbpedia.org/rest/annotate"),
+ @Property(name = PARAM_DISAMBIGUATOR, value = "Document"),
+ @Property(name = PARAM_RESTRICTION),
+ @Property(name = PARAM_SPARQL),
+ @Property(name = PARAM_SUPPORT),
+ @Property(name = PARAM_CONFIDENCE)
+})
public class DBPSpotlightDisambiguateEnhancementEngine extends
AbstractEnhancementEngine<IOException, RuntimeException> implements
EnhancementEngine, ServiceProperties {
- // all parameters which can be used to configure the EnhancementEngine
- @Property(value = "http://spotlight.dbpedia.org/rest/annotate")
- public static final String SL_URL_KEY = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.url";
-
- @Property(value = "Document")
- public static final String SL_DISAMBIGUATOR = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.disambiguator";
-
- @Property()
- public static final String SL_RESTRICTION = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.types";
-
- @Property()
- public static final String SL_SPARQL = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.sparql";
-
- @Property()
- public static final String SL_SUPPORT = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.support";
-
- @Property()
- public static final String SL_CONFIDENCE = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.confidence";
+ /**
+ * Ensures this engine is deactivated in {@link OfflineMode}
+ */
+ @SuppressWarnings("unused")
+ @Reference
+ private OnlineMode onlineMode;
/**
* The default value for the Execution of this Engine. Currently set to
@@ -116,19 +118,11 @@ public class DBPSpotlightDisambiguateEnh
*/
public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 31;
- /**
- * This contains the only MIME type directly supported by this enhancement
- * engine.
- */
- private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
- /** Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} */
- private static final Set<String> SUPPORTED_MIMTYPES = Collections
- .singleton(TEXT_PLAIN_MIMETYPE);
/** This contains the logger. */
private static final Logger log = LoggerFactory
.getLogger(DBPSpotlightDisambiguateEnhancementEngine.class);
/** holds the url of the Spotlight REST endpoint */
- private String spotlightUrl;
+ private URL spotlightUrl;
/** holds the chosen of disambiguator to be used */
private String spotlightDisambiguator;
/** holds the type restriction for the results, if the user wishes one */
@@ -144,7 +138,20 @@ public class DBPSpotlightDisambiguateEnh
* Spotlight, and later for linking of the results
*/
private Hashtable<String, UriRef> textAnnotationsMap;
-
+ /**
+ * Default constructor used by OSGi. It is expected that
+ * {@link #activate(ComponentContext)} is called before
+ * using the instance.
+ */
+ public DBPSpotlightDisambiguateEnhancementEngine(){}
+
+ /**
+ * Constructor intended to be used for unit tests
+ * @param serviceURL the URL of the DBpedia Spotlight REST endpoint used by this engine
+ */
+ protected DBPSpotlightDisambiguateEnhancementEngine(URL serviceURL){
+ this.spotlightUrl = serviceURL;
+ }
/**
* Initialize all parameters from the configuration panel, or with their
* default values
@@ -159,18 +166,17 @@ public class DBPSpotlightDisambiguateEnh
super.activate(ce);
Dictionary<String, Object> properties = ce.getProperties();
- spotlightUrl = properties.get(SL_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/annotate"
- : (String) properties.get(SL_URL_KEY);
- spotlightDisambiguator = properties.get(SL_DISAMBIGUATOR) == null ? null
- : (String) properties.get(SL_DISAMBIGUATOR);
- spotlightTypesRestriction = properties.get(SL_RESTRICTION) == null ? null
- : (String) properties.get(SL_RESTRICTION);
- spotlightSparql = properties.get(SL_SPARQL) == null ? null
- : (String) properties.get(SL_SPARQL);
- spotlightSupport = properties.get(SL_SUPPORT) == null ? "-1"
- : (String) properties.get(SL_SUPPORT);
- spotlightConfidence = properties.get(SL_CONFIDENCE) == null ? "-1"
- : (String) properties.get(SL_CONFIDENCE);
+ spotlightUrl = SpotlightEngineUtils.parseSpotlightServiceURL(properties);
+ spotlightDisambiguator = properties.get(PARAM_DISAMBIGUATOR) == null ? null
+ : (String) properties.get(PARAM_DISAMBIGUATOR);
+ spotlightTypesRestriction = properties.get(PARAM_RESTRICTION) == null ? null
+ : (String) properties.get(PARAM_RESTRICTION);
+ spotlightSparql = properties.get(PARAM_SPARQL) == null ? null
+ : (String) properties.get(PARAM_SPARQL);
+ spotlightSupport = properties.get(PARAM_SUPPORT) == null ? "-1"
+ : (String) properties.get(PARAM_SUPPORT);
+ spotlightConfidence = properties.get(PARAM_CONFIDENCE) == null ? "-1"
+ : (String) properties.get(PARAM_CONFIDENCE);
}
/**
@@ -180,11 +186,8 @@ public class DBPSpotlightDisambiguateEnh
* the {@link ContentItem}
*/
public int canEnhance(ContentItem ci) throws EngineException {
- if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) {
- return ENHANCE_SYNCHRONOUS;
- } else {
- return CANNOT_ENHANCE;
- }
+ return SpotlightEngineUtils.canProcess(ci) ?
+ ENHANCE_ASYNC : CANNOT_ENHANCE;
}
/**
@@ -195,37 +198,21 @@ public class DBPSpotlightDisambiguateEnh
* the {@link ContentItem}
*/
public void computeEnhancements(ContentItem ci) throws EngineException {
- Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci,
- SUPPORTED_MIMTYPES);
- if (contentPart == null) {
- throw new IllegalStateException(
- "No ContentPart with Mimetype '"
- + TEXT_PLAIN_MIMETYPE
- + "' found for ContentItem "
- + ci.getUri()
- + ": This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- }
- String text = "";
- try {
- text = ContentItemHelper.getText(contentPart.getValue());
+ Language language = SpotlightEngineUtils.getContentLanguage(ci);
+ String text = SpotlightEngineUtils.getPlainContent(ci);
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
// Retrieve the existing text annotations (requires read lock)
MGraph graph = ci.getMetadata();
String xmlTextAnnotations = this.getSpottedXml(text, graph);
Collection<Annotation> dbpslGraph = doPostRequest(text,
- xmlTextAnnotations);
+ xmlTextAnnotations, ci.getUri());
if (dbpslGraph != null) {
// Acquire a write lock on the ContentItem when adding the
// enhancements
ci.getLock().writeLock().lock();
try {
- createEnhancements(dbpslGraph, ci);
+ createEnhancements(dbpslGraph, ci, language);
if (log.isDebugEnabled()) {
Serializer serializer = Serializer.getInstance();
ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
@@ -255,17 +242,7 @@ public class DBPSpotlightDisambiguateEnh
* the content item
*/
public void createEnhancements(Collection<Annotation> occs,
- ContentItem ci) {
- final Language language; // used for plain literals representing parts
- // fo the content
- String langString = getMetadataLanguage(ci.getMetadata(), null);
-
- if (langString != null && !langString.isEmpty()) {
- language = new Language(langString);
- } else {
- language = null;
- }
-
+ ContentItem ci, Language language) {
HashMap<Resource, UriRef> entityAnnotationMap = new HashMap<Resource, UriRef>();
for (Annotation occ : occs) {
@@ -276,13 +253,13 @@ public class DBPSpotlightDisambiguateEnh
UriRef entityAnnotation = EnhancementEngineHelper
.createEntityEnhancement(ci, this);
entityAnnotationMap.put(occ.uri, entityAnnotation);
- Literal label = new PlainLiteralImpl(occ.surfaceForm, language);
+ Literal label = new PlainLiteralImpl(occ.surfaceForm.name, language);
model.add(new TripleImpl(entityAnnotation, DC_RELATION,
textAnnotation));
model.add(new TripleImpl(entityAnnotation,
ENHANCER_ENTITY_LABEL, label));
- HashSet<String> t = occ.getTypeNames();
+ Collection<String> t = occ.getTypeNames();
if (t != null) {
Iterator<String> it = t.iterator();
while (it.hasNext())
@@ -302,52 +279,18 @@ public class DBPSpotlightDisambiguateEnh
* a <code>String</code> with the text to be analyzed
* @param xmlTextAnnotations
* @param textAnnotations
+ * @param contentItemUri the URI of the {@link ContentItem} (only
+ * used for logging in case of an error)
* @return a <code>String</code> with the server response
* @throws EngineException
* if the request cannot be sent
*/
- public Collection<Annotation> doPostRequest(String text,
- String xmlTextAnnotations) throws EngineException {
- StringBuilder data = new StringBuilder();
-
- try {
- data.append(URLEncoder.encode("spotter=SpotXmlParser", "UTF-8")
- + "&");
- if (spotlightDisambiguator != null
- && !spotlightDisambiguator.isEmpty())
- data.append(URLEncoder.encode("disambiguator", "UTF-8") + "="
- + URLEncoder.encode(spotlightDisambiguator, "UTF-8")
- + "&");
- if (spotlightTypesRestriction != null
- && !spotlightTypesRestriction.isEmpty())
- data.append(URLEncoder.encode("types", "UTF-8") + "="
- + URLEncoder.encode(spotlightTypesRestriction, "UTF-8")
- + "&");
- if (spotlightSupport != null && !spotlightSupport.isEmpty())
- data.append(URLEncoder.encode("support", "UTF-8") + "="
- + URLEncoder.encode(spotlightSupport, "UTF-8") + "&");
- if (spotlightConfidence != null && !spotlightConfidence.isEmpty())
- data.append(URLEncoder.encode("confidence", "UTF-8") + "="
- + URLEncoder.encode(spotlightConfidence, "UTF-8") + "&");
- if (spotlightSparql != null && !spotlightSparql.isEmpty()
- && spotlightTypesRestriction == null)
- data.append(URLEncoder.encode("sparql", "UTF-8") + "="
- + URLEncoder.encode(spotlightSparql, "UTF-8") + "&");
- data.append(URLEncoder.encode("text", "UTF-8") + "="
- + URLEncoder.encode(xmlTextAnnotations, "UTF-8"));
- } catch (UnsupportedEncodingException e) {
- throw new EngineException(
- "Data for the httprequest could not be converted. Error: "
- + e.getMessage());
- }
-
+ protected Collection<Annotation> doPostRequest(String text,
+ String xmlTextAnnotations, UriRef contentItemUri) throws EngineException {
HttpURLConnection connection = null;
- StringBuffer response = new StringBuffer();
-
+ BufferedWriter wr = null;
try {
- // Create connection
- URL url = new URL(spotlightUrl);
- connection = (HttpURLConnection) url.openConnection();
+ connection = (HttpURLConnection) spotlightUrl.openConnection();
connection.setRequestMethod("POST");
connection.setRequestProperty("Content-Type",
"application/x-www-form-urlencoded");
@@ -358,45 +301,74 @@ public class DBPSpotlightDisambiguateEnh
connection.setDoOutput(true);
// Send request
- DataOutputStream wr = new DataOutputStream(
- connection.getOutputStream());
- wr.writeBytes(data.toString());
- wr.flush();
- wr.close();
+ wr = new BufferedWriter(new OutputStreamWriter(
+ connection.getOutputStream(),UTF8));
+ } catch (IOException e) {
+ IOUtils.closeQuietly(wr);
+ throw new EngineException("Unable to open connection to "+
+ spotlightUrl,e);
+ }
+ try {
- // Get Response
- InputStream is = connection.getInputStream();
- BufferedReader rd = new BufferedReader(new InputStreamReader(is));
- String line;
- while ((line = rd.readLine()) != null) {
- response.append(line);
- response.append('\r');
+ wr.write("spotter=SpotXmlParser&");
+ if (spotlightDisambiguator != null
+ && !spotlightDisambiguator.isEmpty()){
+ wr.write("disambiguator=");
+ wr.write(URLEncoder.encode(spotlightDisambiguator, "UTF-8"));
+ wr.write('&');
}
- rd.close();
-
- } catch (Exception e) {
- log.error("[request - error] The following error occurred: "
- + e.getMessage());
-
- } finally {
-
- if (connection != null) {
- connection.disconnect();
+ if (spotlightTypesRestriction != null
+ && !spotlightTypesRestriction.isEmpty()){
+ wr.write("types=");
+ wr.write(URLEncoder.encode(spotlightTypesRestriction, "UTF-8"));
+ wr.write('&');
+ }
+ if (spotlightSupport != null && !spotlightSupport.isEmpty()) {
+ wr.write("support=");
+ wr.write(URLEncoder.encode(spotlightSupport, "UTF-8"));
+ wr.write('&');
+ }
+ if (spotlightConfidence != null && !spotlightConfidence.isEmpty()){
+ wr.write("confidence=");
+ wr.write(URLEncoder.encode(spotlightConfidence, "UTF-8"));
+ wr.write('&');
+ }
+ if (spotlightSparql != null && !spotlightSparql.isEmpty()
+ && spotlightTypesRestriction == null) {
+ wr.write("sparql=");
+ wr.write(URLEncoder.encode(spotlightSparql, "UTF-8"));
+ wr.write('&');
}
+ wr.write("text=");
+ wr.write(URLEncoder.encode(xmlTextAnnotations, "UTF-8"));
+ } catch (UnsupportedEncodingException e) {
+ throw new IllegalStateException(
+ "The platform does not support encoding " + UTF8.name(),e);
+ } catch (IOException e) {
+ throw new EngineException("Unable to write 'plain/text' content "
+ + "for ContentItem "+contentItemUri+" to "
+ + spotlightUrl,e);
+ } finally {
+ IOUtils.closeQuietly(wr);
}
-
- XMLParser xmlParser = new XMLParser();
+ InputStream is = null;
+ Document xmlDoc;
try {
- Document xmlDoc = xmlParser.loadXMLFromString(response.toString());
- NodeList nlist = xmlParser.getElementsByTagName(xmlDoc, "Resource");
- Collection<Annotation> annos = this.getAnnotations(nlist);
-
- return annos;
- } catch (Exception e) {
- throw new EngineException(
- "Response XML could not be parsed. Error: "
- + e.getMessage());
+ // Get Response
+ is = connection.getInputStream();
+ xmlDoc = loadXMLFromInputStream(is);
+ } catch (IOException e) {
+ throw new EngineException("Unable to spot Entities with"
+ + "Dbpedia Spotlight Annotate RESTful Serice running at "
+ + spotlightUrl,e);
+ } catch(SAXException e) {
+ throw new EngineException("Unable to parse Response from "
+ + "Dbpedia Spotlight Annotate RESTful Serice running at "
+ + spotlightUrl,e);
+ } finally {
+ IOUtils.closeQuietly(is);
}
+ return Annotation.parseAnnotations(xmlDoc);
}
private String getSpottedXml(String text, MGraph graph) {
@@ -427,71 +399,10 @@ public class DBPSpotlightDisambiguateEnh
return xml.append("</annotation>").toString();
}
- /**
- * This method creates the Collection of Annotations, which the method
- * <code>createEnhancement</code> adds to the meta data of the content item.
- *
- * @param nList
- * NodeList of all Resources contained in the XML response from
- * DBpedia Spotlight
- * @return a Collection<DBPSLAnnotation> with all annotations
- */
- private Collection<Annotation> getAnnotations(NodeList nList) {
- Collection<Annotation> dbpslAnnos = new HashSet<Annotation>();
-
- for (int temp = 0; temp < nList.getLength(); temp++) {
- Annotation dbpslann = new Annotation();
- Element node = (Element) nList.item(temp);
- dbpslann.uri = new UriRef(node.getAttribute("URI"));
- dbpslann.support = (new Integer(node.getAttribute("support")))
- .intValue();
- dbpslann.types = node.getAttribute("types");
- dbpslann.surfaceForm = node.getAttribute("surfaceForm");
- dbpslann.offset = (new Integer(node.getAttribute("offset")))
- .intValue();
- dbpslann.similarityScore = (new Double(
- node.getAttribute("similarityScore"))).doubleValue();
- dbpslann.percentageOfSecondRank = (new Double(
- node.getAttribute("percentageOfSecondRank"))).doubleValue();
-
- dbpslAnnos.add(dbpslann);
- }
-
- return dbpslAnnos;
- }
-
public Map<String, Object> getServiceProperties() {
return Collections.unmodifiableMap(Collections.singletonMap(
ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
}
- public String getMetadataLanguage(MGraph model, NonLiteral subj) {
- Iterator<Triple> it = model.filter(subj, DC_LANGUAGE, null);
- if (it.hasNext()) {
- Resource langNode = it.next().getObject();
- return getLexicalForm(langNode);
- }
- return null;
- }
-
- public String getLexicalForm(Resource res) {
- if (res == null) {
- return null;
- } else if (res instanceof Literal) {
- return ((Literal) res).getLexicalForm();
- } else {
- return res.toString();
- }
- }
-
- /**
- * This method is used by the test class to set the endpoint url
- *
- * @param url
- * String the url of the Spotlight endpoint
- */
- public void setEndpointUrl(String url) {
- spotlightUrl = url;
- }
}