You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/08/24 15:48:54 UTC
svn commit: r1376912 [2/2] - in
/incubator/stanbol/branches/dbpedia-spotlight-engines: ./
bundlelist/src/main/bundles/ engines/ engines/dbpedia-spotlight-annotate/
engines/dbpedia-spotlight-candidates/
engines/dbpedia-spotlight-disambiguate/ engines/db...
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/Annotation.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/Annotation.java?rev=1376912&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/Annotation.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/Annotation.java Fri Aug 24 13:48:52 2012
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.model;
+
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.utils.XMLParser.getElementsByTagName;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * Contains a result given by DBPedia Spotlight.
+ *
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+public class Annotation {
+
+ /*
+ * TODO (Note by rwesten 2012-08-22)
+ *
+ * Added here functionality to extract DBpedia
+ * Ontology types for Annotations. This is mainly to
+ * choose the best dc:type for fise:TextAnnotations
+ * created for Annotation.
+ *
+ * This is based on the assumption that the most generic
+ * dbpedia type is always the last one in the returned list.
+ *
+ * In addition "DBpedia:TopicalConcept" is ignored first
+ * as it seems not to be used by dbpedia.org and second
+ * because it is always parsed last (even after schema
+ * and freebase types) and would therefore be considered
+ * as the most generic dbpedia type.
+ *
+ * I do not like this solution and would like to find
+ * a better solution for that
+ */
+ /**
+  * DBpedia Ontology types that {@link #getDbpediaTypeNames()} leaves out
+  * of its result.
+  * <p>
+  * Entries use the prefixed notation of the raw {@link #types} string.
+  * At present only "DBpedia:TopicalConcept" is listed. The set is
+  * unmodifiable.
+  */
+ public static final Set<String> IGNORED_DBP_TYPES = Collections.unmodifiableSet(
+         new HashSet<String>(Collections.singleton("DBpedia:TopicalConcept")));
+
+ public Resource uri;
+ //TODO: change this to a list with the parsed types
+ // Processing of XML results should be done during parsing
+ public String types;
+ public Integer support;
+ //NOTE rwesten: changed this to embed a SurfaceForm so that I
+ // can reuse code for creating fise:TextAnnotations
+ public SurfaceForm surfaceForm;
+ public Double similarityScore;
+ public Double percentageOfSecondRank;
+
+ /**
+  * Getter for all types of this annotation as referenceable URIs.
+  * <p>
+  * The comma separated {@link #types} value is split and the prefixes
+  * "DBpedia:", "Freebase:" and "Schema:" are replaced by their namespace.
+  * Entries that end up empty are skipped.
+  *
+  * @return the type URIs in the order of {@link #types}, or an empty list
+  *         if there are none
+  */
+ public List<String> getTypeNames() {
+     if (types == null) {
+         return Collections.emptyList();
+     }
+     List<String> typeUris = new ArrayList<String>();
+     for (String prefixed : types.split(",")) {
+         // expand the known prefixes so that the returned types are referenceable
+         String expanded = prefixed.replace("DBpedia:", "http://dbpedia.org/ontology/");
+         expanded = expanded.replace("Freebase:", "http://www.freebase.com/schema");
+         expanded = expanded.replace("Schema:", "http://www.schema.org/");
+         if (expanded.isEmpty()) {
+             continue;
+         }
+         typeUris.add(expanded);
+     }
+     return typeUris;
+ }
+
+ /**
+  * Getter for the DBpedia Ontology types of this annotation, leaving out
+  * those listed in {@link #IGNORED_DBP_TYPES}.
+  *
+  * @return the "DBpedia:" prefixed entries of {@link #types} with the prefix
+  *         replaced by "http://dbpedia.org/ontology/"; an empty list if none
+  */
+ public List<String> getDbpediaTypeNames() {
+     if (types == null) {
+         return Collections.emptyList();
+     }
+     List<String> dbpediaUris = new ArrayList<String>();
+     for (String candidate : types.split(",")) {
+         if (!candidate.startsWith("DBpedia:") || IGNORED_DBP_TYPES.contains(candidate)) {
+             continue;
+         }
+         dbpediaUris.add(candidate.replace("DBpedia:", "http://dbpedia.org/ontology/"));
+     }
+     return dbpediaUris;
+ }
+
+ public String toString() {
+ return String
+ .format("[uri=%s, support=%i, types=%s, surfaceForm=\"%s\", similarityScore=%d, percentageOfSecondRank=%d]",
+ uri, support, types, surfaceForm,
+ similarityScore, percentageOfSecondRank);
+ }
+
+ /**
+  * Parses all annotations contained in the parsed XML {@link Document}.
+  * <p>
+  * One {@link Annotation} is created for every "Resource" element, reading
+  * its "URI", "support", "types", "surfaceForm", "offset",
+  * "similarityScore" and "percentageOfSecondRank" attributes.
+  *
+  * @param xmlDoc
+  *            A XML document containing annotations.
+  * @return a {@link Collection} with all parsed annotations
+  * @throws NumberFormatException
+  *             if one of the numeric attributes can not be parsed
+  */
+ public static Collection<Annotation> parseAnnotations(Document xmlDoc) {
+     NodeList resourceNodes = getElementsByTagName(xmlDoc, "Resource");
+     Collection<Annotation> annotations = new HashSet<Annotation>();
+
+     for (int i = 0; i < resourceNodes.getLength(); i++) {
+         Element resource = (Element) resourceNodes.item(i);
+         Annotation annotation = new Annotation();
+         annotation.uri = new UriRef(resource.getAttribute("URI"));
+         annotation.support = Integer.valueOf(resource.getAttribute("support"));
+         annotation.types = resource.getAttribute("types");
+
+         SurfaceForm selected = new SurfaceForm();
+         annotation.surfaceForm = selected;
+         selected.name = resource.getAttribute("surfaceForm");
+         selected.offset = Integer.valueOf(resource.getAttribute("offset"));
+         // the last DBpedia type in the list - assumed to be the most general
+         // one - is used as type of the surface form
+         List<String> dbpediaTypes = annotation.getDbpediaTypeNames();
+         if (!dbpediaTypes.isEmpty()) {
+             selected.type = dbpediaTypes.get(dbpediaTypes.size() - 1);
+         }
+
+         annotation.similarityScore =
+             Double.valueOf(resource.getAttribute("similarityScore"));
+         annotation.percentageOfSecondRank =
+             Double.valueOf(resource.getAttribute("percentageOfSecondRank"));
+         annotations.add(annotation);
+     }
+
+     return annotations;
+ }
+
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/CandidateResource.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/CandidateResource.java?rev=1376912&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/CandidateResource.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/CandidateResource.java Fri Aug 24 13:48:52 2012
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.model;
+
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.utils.XMLParser.getElementsByTagName;
+
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+//import org.apache.clerezza.rdf.core.Resource;
+
+/**
+ * Stores the candidate resources given by DBPedia Spotlight Candidates.
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+public class CandidateResource {
+
+ public String label;
+ public String uri;
+ public double contextualScore;
+ public double percentageOfSecondRank;
+ public double support;
+ public double priorScore;
+ public double finalScore;
+
+ /**
+  * Builds a single-line representation of this candidate for logging.
+  * <p>
+  * The scores are <code>double</code> values and are formatted with
+  * <code>%s</code>; the former <code>%d</code> conversion only accepts
+  * integral types and caused an exception on every call. Each value is
+  * labelled with the name of the field it is read from.
+  *
+  * @return the field values of this candidate
+  */
+ @Override
+ public String toString() {
+     return String.format(
+         "[label=%s, uri=%s, contextualScore=%s, percentageOfSecondRank=%s, support=%s, "
+             + "priorScore=%s, finalScore=%s]",
+         label, uri, contextualScore, percentageOfSecondRank,
+         support, priorScore, finalScore);
+ }
+
+ /**
+  * Parses the surface forms and their candidate resources from the parsed
+  * XML {@link Document}.
+  * <p>
+  * A {@link SurfaceForm} is created for every "surfaceForm" element. Every
+  * child element of it is read as a {@link CandidateResource} ("label",
+  * "uri", "contextualScore", "percentageOfSecondRank", "support",
+  * "priorScore" and "finalScore" attributes) and added to
+  * {@link SurfaceForm#resources}.
+  *
+  * @param xmlDoc
+  *            the XML document containing the surface forms
+  * @return a {@link Collection} with all surface forms including their
+  *         candidates
+  * @throws NumberFormatException
+  *             if one of the numeric attributes can not be parsed
+  */
+ public static Collection<SurfaceForm> parseCandidates(Document xmlDoc) {
+     NodeList surfaceFormNodes = getElementsByTagName(xmlDoc, "surfaceForm");
+     Collection<SurfaceForm> surfaceForms = new HashSet<SurfaceForm>();
+
+     for (int i = 0; i < surfaceFormNodes.getLength(); i++) {
+         Element surfaceFormNode = (Element) surfaceFormNodes.item(i);
+         SurfaceForm surfaceForm = SurfaceForm.parseSerfaceForm(surfaceFormNode);
+
+         NodeList children = surfaceFormNode.getChildNodes();
+         for (int j = 0; j < children.getLength(); j++) {
+             Node child = children.item(j);
+             if (!(child instanceof Element)) {
+                 continue; // skip anything that is not an element
+             }
+             Element candidateNode = (Element) child;
+             CandidateResource candidate = new CandidateResource();
+             candidate.label = candidateNode.getAttribute("label");
+             candidate.uri = candidateNode.getAttribute("uri");
+             candidate.contextualScore =
+                 Double.parseDouble(candidateNode.getAttribute("contextualScore"));
+             candidate.percentageOfSecondRank =
+                 Double.parseDouble(candidateNode.getAttribute("percentageOfSecondRank"));
+             candidate.support =
+                 Double.parseDouble(candidateNode.getAttribute("support"));
+             candidate.priorScore =
+                 Double.parseDouble(candidateNode.getAttribute("priorScore"));
+             candidate.finalScore =
+                 Double.parseDouble(candidateNode.getAttribute("finalScore"));
+             surfaceForm.resources.add(candidate);
+         }
+
+         surfaceForms.add(surfaceForm);
+     }
+
+     return surfaceForms;
+ }
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/SurfaceForm.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/SurfaceForm.java?rev=1376912&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/SurfaceForm.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/model/SurfaceForm.java Fri Aug 24 13:48:52 2012
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.model;
+
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.utils.XMLParser.getElementsByTagName;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+
+//import org.apache.clerezza.rdf.core.Resource;
+
+/**
+ * Stores the surface forms given by DBPedia Spotlight Candidates.
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+public class SurfaceForm {
+
+ public String name;
+ public String type;
+ public Integer offset;
+ public List<CandidateResource> resources = new ArrayList<CandidateResource>();
+
+ /**
+  * Builds a single-line representation of this surface form for logging.
+  * <p>
+  * The offset is formatted with <code>%d</code>: the former
+  * <code>%i</code> is not a conversion known to
+  * {@link java.util.Formatter} and caused an exception on every call.
+  * An offset that is not set is printed as <code>null</code>.
+  *
+  * @return the name, offset and type of this surface form
+  */
+ @Override
+ public String toString() {
+     return String.format("[name=%s, offset=%d, type=%s]", name, offset, type);
+ }
+
+ /**
+  * Parses all {@link SurfaceForm} data from the parsed XML document.
+  * <p>
+  * Every "surfaceForm" element of the document is converted by
+  * {@link #parseSerfaceForm(Element)}.
+  *
+  * @param xmlDoc
+  *            The XML Document containing the surface forms
+  * @return a {@link Collection} with all parsed surface forms
+  */
+ public static Collection<SurfaceForm> parseSurfaceForm(Document xmlDoc) {
+     NodeList surfaceFormNodes = getElementsByTagName(xmlDoc, "surfaceForm");
+     Collection<SurfaceForm> surfaceForms = new HashSet<SurfaceForm>();
+     for (int i = 0; i < surfaceFormNodes.getLength(); i++) {
+         surfaceForms.add(parseSerfaceForm((Element) surfaceFormNodes.item(i)));
+     }
+     return surfaceForms;
+ }
+
+ /**
+  * Creates a {@link SurfaceForm} from the "name", "offset" and "type"
+  * attributes of the parsed element.
+  *
+  * @param node
+  *            the element to read the attributes from
+  * @return the parsed surface form
+  * @throws NumberFormatException
+  *             if the "offset" attribute can not be parsed as integer
+  */
+ protected static SurfaceForm parseSerfaceForm(Element node) {
+     SurfaceForm parsed = new SurfaceForm();
+     parsed.name = node.getAttribute("name");
+     parsed.offset = Integer.valueOf(node.getAttribute("offset"));
+     parsed.type = node.getAttribute("type");
+     return parsed;
+ }
+
+
+}
Copied: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java (from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java?p2=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java&p1=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java&r1=1376420&r2=1376912&rev=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java Fri Aug 24 13:48:52 2012
@@ -16,14 +16,11 @@
*/
package org.apache.stanbol.enhancer.engines.dbpspotlight.spot;
-import static org.apache.stanbol.enhancer.engines.dbpspotlight.spot.XMLParser.getElementsByTagName;
-import static org.apache.stanbol.enhancer.engines.dbpspotlight.spot.XMLParser.loadXMLFromInputStream;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_SPOTTER;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_URL_KEY;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.UTF8;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.utils.XMLParser.loadXMLFromInputStream;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
@@ -32,27 +29,17 @@ import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
-import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
-import java.nio.charset.Charset;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Map;
-import java.util.Map.Entry;
-import java.util.Set;
import org.apache.clerezza.rdf.core.Language;
-import org.apache.clerezza.rdf.core.Literal;
-import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.core.serializedform.Serializer;
import org.apache.commons.io.IOUtils;
@@ -63,22 +50,18 @@ import org.apache.felix.scr.annotations.
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.stanboltools.offline.OfflineMode;
import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
-import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.SurfaceForm;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.utils.SpotlightEngineUtils;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
-import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
-import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
@@ -87,17 +70,19 @@ import org.xml.sax.SAXException;
*
* @author Iavor Jelev, Babelmonkeys (GzEvD)
*/
-@Component(metatype = true, immediate = true, label = "%stanbol.DBPSpotlightSpotEnhancementEngine.name", description = "%stanbol.DBPSpotlightSpotEnhancementEngine.description")
+@Component(metatype = true, immediate = true,
+ label = "%stanbol.DBPSpotlightSpotEnhancementEngine.name",
+ description = "%stanbol.DBPSpotlightSpotEnhancementEngine.description")
@Service
@Properties(value = {
- @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightspot")
+ @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightspot"),
+ @Property(name = PARAM_URL_KEY, value = "http://spotlight.dbpedia.org/rest/spot"),
+ @Property(name = PARAM_SPOTTER)
})
public class DBPSpotlightSpotEnhancementEngine extends
AbstractEnhancementEngine<IOException, RuntimeException> implements
EnhancementEngine, ServiceProperties {
- private static final Charset UTF8 = Charset.forName("UTF-8");
-
/**
* Ensures this engine is deactivated in {@link OfflineMode}
*/
@@ -106,38 +91,11 @@ public class DBPSpotlightSpotEnhancement
private OnlineMode onlineMode;
/**
- * a configurable value of the text segment length to check
- */
- @Property(value = "http://spotlight.dbpedia.org/rest/spot")
- public static final String SL_URL_KEY = "stanbol.DBPSpotlightSpotEnhancementEngine.url";
-
- @Property(value = "LingPipeSpotter")
- public static final String SL_SPOTTER = "stanbol.DBPSpotlightSpotEnhancementEngine.spotter";
-
- /**
* The default value for the Execution of this Engine. Currently set to
* <code>{@link ServiceProperties#ORDERING_CONTENT_EXTRACTION} - 29</code>
*/
public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 29;
- /**
- * This contains the only MIME type directly supported by this enhancement
- * engine.
- */
- private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
- /**
- * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
- */
- private static final Set<String> SUPPORTED_MIMTYPES = Collections
- .singleton(TEXT_PLAIN_MIMETYPE);
-
- /**
- * This contains a list of languages supported by DBpedia Spotlight. If the
- * metadata doesn't contain a value for the language as the value of the
- * {@link Property.DC_LANG property} the content can't be processed.
- */
- protected static final Set<String> SUPPORTED_LANGUAGES = Collections
- .unmodifiableSet(new HashSet<String>(Arrays.asList("en")));
/** holds the logger. */
private static final Logger log = LoggerFactory
@@ -172,24 +130,12 @@ public class DBPSpotlightSpotEnhancement
super.activate(ce);
Dictionary<String, Object> properties = ce.getProperties();
- Object value = properties.get(SL_URL_KEY);
- if(value == null || value.toString().isEmpty()){
- throw new ConfigurationException(SL_URL_KEY, "The URL with the DBpedia "
- + "Spotlight Spot RESTful Service MUST NOT be NULL nor empty!");
- } else {
- String url = (String) properties.get(SL_URL_KEY);
- try {
- this.spotlightUrl = new URL(url);
- } catch (MalformedURLException e) {
- throw new ConfigurationException(SL_URL_KEY, "The parsed URL for the "
- + "DBpedia Spotlight Spot RESTful Service is illegal formatted!",
- e);
- }
- }
+ spotlightUrl = SpotlightEngineUtils.parseSpotlightServiceURL(properties);
+
//also set the spotter to null if an empty string is parsed
- value = properties.get(SL_SPOTTER);
- spotlightSpotter = value != null && !value.toString().isEmpty() ?
- value.toString() : null;
+ Object spotterConfig = properties.get(PARAM_SPOTTER);
+ spotlightSpotter = spotterConfig != null && !spotterConfig.toString().isEmpty() ?
+ spotterConfig.toString() : null;
}
/**
@@ -199,23 +145,8 @@ public class DBPSpotlightSpotEnhancement
* the {@link ContentItem}
*/
public int canEnhance(ContentItem ci) throws EngineException {
- if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) {
- String language = EnhancementEngineHelper.getLanguage(ci);
- if (!SUPPORTED_LANGUAGES.contains(language)) {
- log.info("DBpedia Spotlight can not process ContentItem {} "
- + "because language {} is not supported (supported: {})",
- new Object[] { ci.getUri(), language, SUPPORTED_LANGUAGES });
- return CANNOT_ENHANCE;
- }
- //rwesten: ASYNC support is highly recommended for engines that
- // do call remote services
- return ENHANCE_ASYNC;
- } else {
- log.info("DBpedia Spotlight can not process ContentItem {} "
- + "because it does not have 'plain/text' content",
- ci.getUri());
- return CANNOT_ENHANCE;
- }
+ return SpotlightEngineUtils.canProcess(ci) ?
+ ENHANCE_ASYNC : CANNOT_ENHANCE;
}
/**
@@ -226,35 +157,8 @@ public class DBPSpotlightSpotEnhancement
* the {@link ContentItem}
*/
public void computeEnhancements(ContentItem ci) throws EngineException {
- Language language;
- String lang = EnhancementEngineHelper.getLanguage(ci);
- if(!SUPPORTED_LANGUAGES.contains(lang)){
- throw new IllegalStateException("Langage '"+lang
- + "' as annotated for ContentItem "
- + ci.getUri() + " is not supported by this Engine: "
- + "This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- } else {
- language = lang == null || lang.isEmpty() ? null : new Language(lang);
- }
- Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci,
- SUPPORTED_MIMTYPES);
- if (contentPart == null) {
- throw new IllegalStateException("No ContentPart with Mimetype '"
- + TEXT_PLAIN_MIMETYPE
- + "' found for ContentItem "
- + ci.getUri()
- + ": This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- }
- String text = "";
- try {
- text = ContentItemHelper.getText(contentPart.getValue());
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
+ Language language = SpotlightEngineUtils.getContentLanguage(ci);
+ String text = SpotlightEngineUtils.getPlainContent(ci);
Collection<SurfaceForm> dbpslGraph = doPostRequest(text,ci.getUri());
if (dbpslGraph != null) {
@@ -292,27 +196,13 @@ public class DBPSpotlightSpotEnhancement
*/
protected void createEnhancements(Collection<SurfaceForm> occs,
ContentItem ci, String content, Language lang) {
- LiteralFactory literalFactory = LiteralFactory.getInstance();
HashMap<String, UriRef> entityAnnotationMap = new HashMap<String, UriRef>();
MGraph model = ci.getMetadata();
for (SurfaceForm occ : occs) {
- UriRef textAnnotation = EnhancementEngineHelper
- .createTextEnhancement(ci, this);
- model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
- new PlainLiteralImpl(occ.name, lang)));
- model.add(new TripleImpl(textAnnotation, ENHANCER_START,
- literalFactory.createTypedLiteral(occ.offset)));
- model.add(new TripleImpl(textAnnotation, ENHANCER_END,
- literalFactory.createTypedLiteral(occ.offset
- + occ.name.length())));
- model.add(new TripleImpl(textAnnotation, DC_TYPE, new UriRef(
- occ.type)));
- model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
- new PlainLiteralImpl(
- getSelectionContext(content, occ.name, occ.offset),
- lang)));
+ UriRef textAnnotation = SpotlightEngineUtils.createTextEnhancement(
+ occ, this, ci, content, lang);
if (entityAnnotationMap.containsKey(occ.name)) {
model.add(new TripleImpl(entityAnnotationMap.get(occ.name),
DC_RELATION, textAnnotation));
@@ -322,6 +212,7 @@ public class DBPSpotlightSpotEnhancement
}
}
+
/**
* Sends a POST request to the DBpediaSpotlight url.
*
@@ -407,93 +298,13 @@ public class DBPSpotlightSpotEnhancement
}
//rwesten: commented the disconnect to allow keep-alive
//connection.disconnect();
- NodeList nlist = getElementsByTagName(xmlDoc,"surfaceForm");
- return getAnnotations(nlist);
+ return SurfaceForm.parseSurfaceForm(xmlDoc);
}
- /**
- * This method creates the Collection of surface forms, which the method
- * <code>createEnhancement</code> adds to the meta data of the content item
- * as TextAnnotations.
- *
- * @param nList
- * NodeList of all Resources contained in the XML response from
- * DBpedia Spotlight
- * @return a Collection<DBPSLSurfaceForm> with all annotations
- */
- private Collection<SurfaceForm> getAnnotations(NodeList nList) {
- Collection<SurfaceForm> dbpslAnnos = new HashSet<SurfaceForm>();
-
- for (int temp = 0; temp < nList.getLength(); temp++) {
- SurfaceForm dbpslann = new SurfaceForm();
- Element node = (Element) nList.item(temp);
- dbpslann.name = node.getAttribute("name");
- dbpslann.offset = (new Integer(node.getAttribute("offset")))
- .intValue();
- dbpslann.type = node.getAttribute("type");
- dbpslAnnos.add(dbpslann);
- }
-
- return dbpslAnnos;
- }
public Map<String, Object> getServiceProperties() {
return Collections.unmodifiableMap(Collections.singletonMap(
ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
}
-// rwesten: Use the Utility provided by the EnhancementEngineHelper instead
-// public String getMetadataLanguage(MGraph model, NonLiteral subj) {
-// Iterator<Triple> it = model.filter(subj, DC_LANGUAGE, null);
-// if (it.hasNext()) {
-// Resource langNode = it.next().getObject();
-// return getLexicalForm(langNode);
-// }
-// return null;
-// }
-
-// rwesten: unused
-// public String getLexicalForm(Resource res) {
-// if (res == null) {
-// return null;
-// } else if (res instanceof Literal) {
-// return ((Literal) res).getLexicalForm();
-// } else {
-// return res.toString();
-// }
-// }
-
- private static final int DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE = 50;
- /**
- * Extracts the selection context based on the content, selection and
- * the start char offset of the selection
- * @param content the content
- * @param selection the selected text
- * @param selectionStartPos the start char position of the selection
- * @return the context
- */
- protected static String getSelectionContext(String content, String selection,int selectionStartPos){
- //extract the selection context
- int beginPos;
- if(selectionStartPos <= DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE){
- beginPos = 0;
- } else {
- int start = selectionStartPos-DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
- beginPos = content.indexOf(' ',start);
- if(beginPos < 0 || beginPos >= selectionStartPos){ //no words
- beginPos = start; //begin within a word
- }
- }
- int endPos;
- if(selectionStartPos+selection.length()+DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE >= content.length()){
- endPos = content.length();
- } else {
- int start = selectionStartPos+selection.length()+DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
- endPos = content.lastIndexOf(' ', start);
- if(endPos <= selectionStartPos+selection.length()){
- endPos = start; //end within a word;
- }
- }
- return content.substring(beginPos, endPos);
- }
}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/SpotlightEngineUtils.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/SpotlightEngineUtils.java?rev=1376912&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/SpotlightEngineUtils.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/SpotlightEngineUtils.java Fri Aug 24 13:48:52 2012
@@ -0,0 +1,285 @@
+package org.apache.stanbol.enhancer.engines.dbpspotlight.utils;
+
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PARAM_URL_KEY;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PROPERTY_CONTEXTUAL_SCORE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PROPERTY_FINAL_SCORE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PROPERTY_PERCENTAGE_OF_SECOND_RANK;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PROPERTY_PRIOR_SCORE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PROPERTY_SIMILARITY_SCORE;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.PROPERTY_SUPPORT;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.SUPPORTED_LANGUAGES;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.SUPPORTED_MIMTYPES;
+import static org.apache.stanbol.enhancer.engines.dbpspotlight.Constants.TEXT_PLAIN_MIMETYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Dictionary;
+import java.util.Map.Entry;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.Constants;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.Annotation;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.CandidateResource;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.SurfaceForm;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.osgi.service.cm.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Shared utilities for the Spotlight Enhancement Engines.
+ * <p>
+ * The class only consists of static helpers: checks on the processed
+ * {@link ContentItem}, parsing of the service URL from the engine
+ * configuration and the creation of fise:TextAnnotations and
+ * fise:EntityAnnotations in the metadata of a {@link ContentItem}.
+ */
+public class SpotlightEngineUtils {
+
+    private static final Logger log = LoggerFactory.getLogger(SpotlightEngineUtils.class);
+
+    private static final LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+    /**
+     * Number of chars taken before and after a selection when the
+     * selection context is extracted (see
+     * {@link #getSelectionContext(String, String, int)}).
+     */
+    private static final int DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE = 50;
+
+    /**
+     * Checks if the parsed ContentItem has a Blob with one of the
+     * {@link Constants#SUPPORTED_MIMTYPES} and if the language annotated
+     * for the ContentItem is contained in
+     * {@link Constants#SUPPORTED_LANGUAGES}. The reason for a negative
+     * result is logged on INFO level.
+     * @param ci the content item
+     * @return <code>true</code> if both checks succeed, otherwise
+     * <code>false</code>
+     */
+    public static boolean canProcess(ContentItem ci){
+        if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) == null) {
+            //report the same mime type constant as used by getPlainContent(..)
+            log.info("DBpedia Spotlight can not process ContentItem {} "
+                    + "because it does not have '{}' content",
+                    ci.getUri(), TEXT_PLAIN_MIMETYPE);
+            return false;
+        }
+        String language = EnhancementEngineHelper.getLanguage(ci);
+        if (!SUPPORTED_LANGUAGES.contains(language)) {
+            log.info("DBpedia Spotlight can not process ContentItem {} "
+                    + "because language {} is not supported (supported: {})",
+                    new Object[] { ci.getUri(), language, SUPPORTED_LANGUAGES });
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * Getter for the language annotated for the parsed ContentItem.
+     * @param ci the content item
+     * @return the {@link Language} or <code>null</code> if the annotated
+     * language is <code>null</code> or empty
+     * @throws IllegalStateException if the annotated language is not
+     * contained in {@link Constants#SUPPORTED_LANGUAGES}
+     */
+    public static Language getContentLanguage(ContentItem ci) {
+        String lang = EnhancementEngineHelper.getLanguage(ci);
+        if (!SUPPORTED_LANGUAGES.contains(lang)) {
+            throw new IllegalStateException("Langage '"+lang
+                    + "' as annotated for ContentItem "
+                    + ci.getUri() + " is not supported by this Engine: "
+                    + "This is also checked in the canEnhance method! -> This "
+                    + "indicated an Bug in the implementation of the "
+                    + "EnhancementJobManager!");
+        }
+        if (lang == null || lang.isEmpty()) {
+            return null;
+        }
+        return new Language(lang);
+    }
+
+    /**
+     * Reads the text of the first Blob of the parsed ContentItem that has
+     * one of the {@link Constants#SUPPORTED_MIMTYPES}.
+     * @param ci the content item
+     * @return the text as returned by
+     * {@link ContentItemHelper#getText(Blob)}
+     * @throws IllegalStateException if the ContentItem has no such Blob
+     * @throws EngineException if reading the Blob fails. The
+     * {@link IOException} is set as cause.
+     */
+    public static String getPlainContent(ContentItem ci)
+            throws EngineException {
+        Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci,
+                SUPPORTED_MIMTYPES);
+        if (contentPart == null) {
+            throw new IllegalStateException(
+                    "No ContentPart with Mimetype '"
+                            + TEXT_PLAIN_MIMETYPE
+                            + "' found for ContentItem "
+                            + ci.getUri()
+                            + ": This is also checked in the canEnhance method! -> This "
+                            + "indicated an Bug in the implementation of the "
+                            + "EnhancementJobManager!");
+        }
+        try {
+            return ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            //keep the IOException as cause so that the reason is not lost
+            throw new EngineException("Unable to read plain text content from "
+                    + "contentpart " + contentPart.getKey() + " of ContentItem "
+                    + ci.getUri(), e);
+        }
+    }
+
+    /**
+     * Parses the URL from the {@link Constants#PARAM_URL_KEY}
+     * @param properties the configuration of the engine
+     * @return the URL of the service
+     * @throws ConfigurationException if the configuration is missing,
+     * empty or not a valid URL
+     */
+    public static URL parseSpotlightServiceURL(
+            Dictionary<String, Object> properties)
+            throws ConfigurationException {
+        Object value = properties.get(PARAM_URL_KEY);
+        if (value == null || value.toString().isEmpty()) {
+            throw new ConfigurationException(PARAM_URL_KEY, "The URL with the DBpedia "
+                    + "Spotlight Annotate RESTful Service MUST NOT be NULL nor empty!");
+        }
+        try {
+            return new URL(value.toString());
+        } catch (MalformedURLException e) {
+            throw new ConfigurationException(PARAM_URL_KEY, "The parsed URL for the "
+                    + "DBpedia Spotlight Annotate RESTful Service is illegal formatted!",
+                    e);
+        }
+    }
+
+    /**
+     * Extracts the selection context based on the content, selection and
+     * the start char offset of the selection. <p>
+     * The context covers up to
+     * {@link #DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE} chars before
+     * and after the selection. Both ends are moved inwards to the next
+     * space char if there is one between the end of the window and the
+     * selection; otherwise the context begins/ends within a word.
+     * @param content the content
+     * @param selection the selected text
+     * @param selectionStartPos the start char position of the selection
+     * @return the context
+     */
+    public static String getSelectionContext(String content, String selection,int selectionStartPos){
+        final int window = DEFAULT_SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
+        final int contentLength = content.length();
+        final int selectionEndPos = selectionStartPos + selection.length();
+
+        //(1) the begin of the context
+        int beginPos = 0; //used if the selection is close to the start
+        if (selectionStartPos > window) {
+            int windowStart = selectionStartPos - window;
+            beginPos = content.indexOf(' ', windowStart);
+            if (beginPos < 0 || beginPos >= selectionStartPos) { //no words
+                beginPos = windowStart; //begin within a word
+            }
+        }
+        //(2) the end of the context
+        int endPos = contentLength; //used if the selection is close to the end
+        if (selectionEndPos + window < contentLength) {
+            int windowEnd = selectionEndPos + window;
+            endPos = content.lastIndexOf(' ', windowEnd);
+            if (endPos <= selectionEndPos) { //no words
+                endPos = windowEnd; //end within a word
+            }
+        }
+        return content.substring(beginPos, endPos);
+    }
+
+    /**
+     * Creates a fise:TextAnnotation for the parsed parameters and
+     * adds it the the {@link ContentItem#getMetadata()}. <p>
+     * This method assumes a write lock on the parsed content item.
+     * @param occ the SurfaceForm
+     * @param engine the Engine
+     * @param ci the ContentITem
+     * @param content the content
+     * @param lang the language of the content or <code>null</code>
+     * @return the URI of the created fise:TextAnnotation
+     */
+    public static UriRef createTextEnhancement(SurfaceForm occ,
+            EnhancementEngine engine, ContentItem ci, String content,
+            Language lang) {
+        MGraph model = ci.getMetadata();
+        UriRef textAnnotation = EnhancementEngineHelper
+                .createTextEnhancement(ci, engine);
+        //selected text with its start/end char offsets
+        model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
+                new PlainLiteralImpl(occ.name, lang)));
+        model.add(new TripleImpl(textAnnotation, ENHANCER_START,
+                literalFactory.createTypedLiteral(occ.offset)));
+        model.add(new TripleImpl(textAnnotation, ENHANCER_END,
+                literalFactory.createTypedLiteral(occ.offset
+                        + occ.name.length())));
+        //the dc:type is only written if the surface form defines one
+        if (occ.type != null && !occ.type.isEmpty()) {
+            model.add(new TripleImpl(textAnnotation, DC_TYPE, new UriRef(
+                    occ.type)));
+        }
+        String context = getSelectionContext(content, occ.name, occ.offset);
+        model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
+                new PlainLiteralImpl(context, lang)));
+        return textAnnotation;
+    }
+
+    /**
+     * Creates a fise:EntityAnnotation for the parsed parameters and
+     * adds it the the {@link ContentItem#getMetadata()}. <p>
+     * The label of the candidate is added with the language "en". <p>
+     * This method assumes a write lock on the parsed content item.
+     * @param resource the candidate resource
+     * @param engine the engine
+     * @param ci the content item
+     * @param textAnnotation the fise:TextAnnotation to dc:relate the
+     * created fise:EntityAnnotation
+     * @return the URI of the created fise:EntityAnnotation
+     */
+    public static UriRef createEntityAnnotation(CandidateResource resource,
+            EnhancementEngine engine, ContentItem ci, UriRef textAnnotation) {
+        UriRef entityAnnotation = EnhancementEngineHelper
+                .createEntityEnhancement(ci, engine);
+        MGraph model = ci.getMetadata();
+        Literal label = new PlainLiteralImpl(resource.label,
+                new Language("en"));
+        model.add(new TripleImpl(entityAnnotation, DC_RELATION,
+                textAnnotation));
+        model.add(new TripleImpl(entityAnnotation,
+                ENHANCER_ENTITY_LABEL, label));
+        model.add(new TripleImpl(entityAnnotation,
+                ENHANCER_ENTITY_REFERENCE, new UriRef(resource.uri)));
+        //add spotlight specific information
+        model.add(new TripleImpl(entityAnnotation, PROPERTY_CONTEXTUAL_SCORE,
+                literalFactory.createTypedLiteral(resource.contextualScore)));
+        model.add(new TripleImpl(entityAnnotation, PROPERTY_PERCENTAGE_OF_SECOND_RANK,
+                literalFactory.createTypedLiteral(resource.percentageOfSecondRank)));
+        model.add(new TripleImpl(entityAnnotation, PROPERTY_SUPPORT,
+                literalFactory.createTypedLiteral(resource.support)));
+        model.add(new TripleImpl(entityAnnotation, PROPERTY_PRIOR_SCORE,
+                literalFactory.createTypedLiteral(resource.priorScore)));
+        model.add(new TripleImpl(entityAnnotation, PROPERTY_FINAL_SCORE,
+                literalFactory.createTypedLiteral(resource.finalScore)));
+        return entityAnnotation;
+    }
+
+    /**
+     * Creates a fise:EntityAnnotation for the parsed parameter and
+     * adds it the the {@link ContentItem#getMetadata()}. <p>
+     * This method assumes a write lock on the parsed content item.
+     * @param annotation the Annotation
+     * @param engine the engine
+     * @param ci the content item
+     * @param textAnnotation the TextAnnotation the created
+     * EntityAnnotation links by using dc:relation
+     * @param language the language of the label of the referenced
+     * Entity (or <code>null</code> if none).
+     */
+    public static void createEntityAnnotation(Annotation annotation,
+            EnhancementEngine engine, ContentItem ci,
+            UriRef textAnnotation, Language language) {
+        MGraph model = ci.getMetadata();
+        UriRef entityAnnotation = EnhancementEngineHelper
+                .createEntityEnhancement(ci, engine);
+        //the name of the surface form is used as label of the entity
+        Literal label = new PlainLiteralImpl(annotation.surfaceForm.name,
+                language);
+        model.add(new TripleImpl(entityAnnotation, DC_RELATION,
+                textAnnotation));
+        model.add(new TripleImpl(entityAnnotation,
+                ENHANCER_ENTITY_LABEL, label));
+        model.add(new TripleImpl(entityAnnotation,
+                ENHANCER_ENTITY_REFERENCE, annotation.uri));
+        //set the fise:entity-type
+        for (String type : annotation.getTypeNames()) {
+            model.add(new TripleImpl(entityAnnotation,
+                    ENHANCER_ENTITY_TYPE, new UriRef(type)));
+        }
+        //TODO (rwesten): Pleas check: I use the similarityScore as fise:confidence value
+        model.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE,
+                literalFactory.createTypedLiteral(annotation.similarityScore)));
+        //add spotlight specific information
+        model.add(new TripleImpl(entityAnnotation, PROPERTY_PERCENTAGE_OF_SECOND_RANK,
+                literalFactory.createTypedLiteral(annotation.percentageOfSecondRank)));
+        model.add(new TripleImpl(entityAnnotation, PROPERTY_SUPPORT,
+                literalFactory.createTypedLiteral(annotation.support)));
+        model.add(new TripleImpl(entityAnnotation, PROPERTY_SIMILARITY_SCORE,
+                literalFactory.createTypedLiteral(annotation.similarityScore)));
+    }
+
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/XMLParser.java?rev=1376912&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/XMLParser.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/utils/XMLParser.java Fri Aug 24 13:48:52 2012
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.utils;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Parses the XML results given by DBPedia Spotlight.
+ * <p>
+ * All parsers created by this class have the loading of external
+ * entities and external DTDs switched off (as far as supported by the
+ * used parser implementation), because the parsed documents are
+ * received from a remote service.
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+public final class XMLParser {
+
+    /**
+     * Parser features that are set to <code>false</code> so that parsed
+     * documents can not trigger the loading of external resources (XXE).
+     */
+    private static final String[] EXTERNAL_ENTITY_FEATURES = {
+        "http://xml.org/sax/features/external-general-entities",
+        "http://xml.org/sax/features/external-parameter-entities",
+        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
+    };
+
+    /**
+     * Do not create instances of Utility Classes
+     */
+    private XMLParser() {
+    }
+
+    /**
+     * Getter for all elements of the document with the parsed tag name.
+     * @param doc the document
+     * @param tagName the name of the tag
+     * @return the result of {@link Document#getElementsByTagName(String)}
+     */
+    public static NodeList getElementsByTagName(Document doc, String tagName) {
+        return doc.getElementsByTagName(tagName);
+    }
+
+    /**
+     * Parses the XML document from the parsed String by using a namespace
+     * aware parser. <p>
+     * The String is parsed as character stream. Converting it to bytes
+     * with the platform default charset would corrupt non-ASCII chars
+     * whenever this charset differs from the encoding assumed by the
+     * parser.
+     * @param xml the XML document
+     * @return the parsed and normalized document
+     * @throws SAXException if the String is not well formed XML
+     * @throws IOException on any IO error while parsing
+     * @throws IllegalStateException if no parser can be created
+     */
+    public static Document loadXMLFromString(String xml) throws SAXException,
+            IOException {
+        Document doc = newNamespaceAwareBuilder().parse(
+                new InputSource(new StringReader(xml)));
+        doc.getDocumentElement().normalize();
+        return doc;
+    }
+
+    /**
+     * Parses the XML document from the parsed stream by using a namespace
+     * aware parser. The stream is closed by this method regardless if the
+     * parsing succeeds or fails.
+     * @param is the stream to read the XML document from
+     * @return the parsed and normalized document
+     * @throws SAXException if the content is not well formed XML
+     * @throws IOException on any IO error while reading or closing the
+     * stream
+     * @throws IllegalArgumentException if the parsed stream is
+     * <code>null</code>
+     * @throws IllegalStateException if no parser can be created
+     */
+    public static Document loadXMLFromInputStream(InputStream is) throws SAXException,
+            IOException {
+        if (is == null) {
+            throw new IllegalArgumentException("The parsed InputStream MUST NOT be NULL!");
+        }
+        try {
+            Document doc = newNamespaceAwareBuilder().parse(is);
+            doc.getDocumentElement().normalize();
+            return doc;
+        } finally {
+            is.close(); //also release the stream if the parsing fails
+        }
+    }
+
+    /**
+     * Parses the XML document from the file with the parsed path. Other
+     * than for the other load methods the used parser is NOT namespace
+     * aware.
+     * @param filePath the path to the XML file
+     * @return the parsed and normalized document
+     * @throws ParserConfigurationException if no parser can be created
+     * @throws SAXException if the content is not well formed XML
+     * @throws IOException on any IO error while reading the file
+     */
+    public static Document loadXMLFromFile(String filePath)
+            throws ParserConfigurationException, SAXException, IOException {
+        Document doc = newBuilder(false).parse(new File(filePath));
+        doc.getDocumentElement().normalize();
+        return doc;
+    }
+
+    /**
+     * Creates a namespace aware builder for methods that do not declare
+     * {@link ParserConfigurationException}. Such failures are reported as
+     * {@link IllegalStateException} instead of being ignored.
+     */
+    private static DocumentBuilder newNamespaceAwareBuilder() {
+        try {
+            return newBuilder(true);
+        } catch (ParserConfigurationException e) {
+            throw new IllegalStateException(
+                    "Unable to create a namespace aware XML DocumentBuilder", e);
+        }
+    }
+
+    /**
+     * Creates a builder with the loading of external entities and DTDs
+     * switched off.
+     */
+    private static DocumentBuilder newBuilder(boolean namespaceAware)
+            throws ParserConfigurationException {
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        factory.setNamespaceAware(namespaceAware);
+        for (String feature : EXTERNAL_ENTITY_FEATURES) {
+            try {
+                factory.setFeature(feature, false);
+            } catch (ParserConfigurationException ignored) {
+                //best effort: a parser implementation that does not know
+                //the feature must not prevent the parsing of documents
+            }
+        }
+        return factory.newDocumentBuilder();
+    }
+}
\ No newline at end of file
Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1376912&r1=1376397&r2=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/OSGI-INF/metatype/metatype.properties Fri Aug 24 13:48:52 2012
@@ -21,22 +21,50 @@
# descriptions as used in the metatype.xml descriptor generated by the
# the maven SCR plugin
-stanbol.DBPSpotlightAnnotateEnhancementEngine.name = DBpedia Spotlight Annotate: Named Entity Extraction and Ontology Linking
-stanbol.DBPSpotlightAnnotateEnhancementEngine.description = Find names of people, organization, \
- places... disambiguate and link them to DBpedia Ontology URIs. This is a complete EnhancementChain, all in one Engine.
-stanbol.DBPSpotlightAnnotateEnhancementEngine.url.name = Spotlight URL
-stanbol.DBPSpotlightAnnotateEnhancementEngine.url.description = The URL which will be used for the request
-stanbol.DBPSpotlightAnnotateEnhancementEngine.spotter.name = Spotter
-stanbol.DBPSpotlightAnnotateEnhancementEngine.spotter.description = The algorithm which will be used for Spotting \
+# Request Properties (shared by all Engines)
+
+dbpedia.spotlight.url.name = Spotlight URL
+dbpedia.spotlight.url.description = The URL which will be used for the request
+dbpedia.spotlight.spotter.name = Spotter
+dbpedia.spotlight.spotter.description = The algorithm which will be used for Spotting \
(aka Term Recognition). Currently available: NER, LingPipeSpotter, OpenNLPChunkerSpotter, Kea
-stanbol.DBPSpotlightAnnotateEnhancementEngine.disambiguator.name = Disambiguator
-stanbol.DBPSpotlightAnnotateEnhancementEngine.disambiguator.description = The algorithm used for ranking of senses \
+dbpedia.spotlight.disambiguator.name = Disambiguator
+dbpedia.spotlight.disambiguator.description = The algorithm used for ranking of senses \
based on context. Currently available: Document, Occurrences
-stanbol.DBPSpotlightAnnotateEnhancementEngine.types.name = Types Restriction
-stanbol.DBPSpotlightAnnotateEnhancementEngine.types.description = The DBpedia Ontology types you wish to restrict your results to
-stanbol.DBPSpotlightAnnotateEnhancementEngine.sparql.name = Sparql
-stanbol.DBPSpotlightAnnotateEnhancementEngine.sparql.description = Restrict the result with SPARQL
-stanbol.DBPSpotlightAnnotateEnhancementEngine.support.name = Support
-stanbol.DBPSpotlightAnnotateEnhancementEngine.support.description = Filter the results based on a support metric
-stanbol.DBPSpotlightAnnotateEnhancementEngine.confidence.name = Confidence
-stanbol.DBPSpotlightAnnotateEnhancementEngine.confidence.description = Filter the results based on a confidence metric
+dbpedia.spotlight.types.name = Types Restriction
+dbpedia.spotlight.types.description = The DBpedia Ontology types you wish to restrict your results to
+dbpedia.spotlight.sparql.name = Sparql
+dbpedia.spotlight.sparql.description = Restrict the result with SPARQL
+dbpedia.spotlight.support.name = Support
+dbpedia.spotlight.support.description = Filter the results based on a support metric
+dbpedia.spotlight.confidence.name = Confidence
+dbpedia.spotlight.confidence.description = Filter the results based on a confidence metric
+
+
+#Annotate
+
+dbpedia.spotlight.name = DBpedia Spotlight Annotate: Named Entity Extraction and Ontology Linking
+dbpedia.spotlight.description = Find names of people, organization, \
+ places... disambiguate and link them to DBpedia Ontology URIs. This is a complete EnhancementChain, all in one Engine.
+
+
+# SPOT
+
+stanbol.DBPSpotlightSpotEnhancementEngine.name = DBpedia Spotlight Spotter: Named Entity Recognition
+stanbol.DBPSpotlightSpotEnhancementEngine.description = This engine performs just Named Entity Recognition, \
+ so it is suited for EnhancementChain scenario, in which another Engine links the recognized TextAnnotations \
+ to Ontology Types
+
+# Candidates
+
+stanbol.DBPSpotlightCandidatesEnhancementEngine.name = DBpedia Spotlight Candidates: Named Entity Extraction and Ontology Linking
+stanbol.DBPSpotlightCandidatesEnhancementEngine.description = Find names of people, organization, \
+ places... disambiguate and link them to DBpedia Ontology URIs. The difference to the DBPSpotlightAnnotateEnhancementEngine is that \
+ all candidate URIs for a given TextAnnotation are delivered, as opposed to just the top K
+
+
+#Disambiguate
+
+stanbol.DBPSpotlightDisambiguateEnhancementEngine.name = DBpedia Spotlight Disambiguate: Disambiguation and Ontology Linking
+stanbol.DBPSpotlightDisambiguateEnhancementEngine.description = It uses TextAnnotations added by a Spotter, so it can only be used \
+ in an EnhancementChain context. It disambiguates and links them to DBpedia Ontology URIs.
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpspotlight.config
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpspotlight.config?rev=1376912&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpspotlight.config (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/main/resources/config/org.apache.stanbol.enhancer.chain.weighted.impl.WeightedChain-dbpspotlight.config Fri Aug 24 13:48:52 2012
@@ -0,0 +1,2 @@
+stanbol.enhancer.chain.name="dbpedia-spotlight"
+stanbol.enhancer.chain.weighted.chain=["tika;optional","metaxa;optional","langdetect","dbpspotlightannotate"]
Copied: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java (from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java?p2=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java&p1=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java&r1=1376420&r2=1376912&rev=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-annotate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/annotate/DBPSpotlightAnnotateEnhancementTest.java Fri Aug 24 13:48:52 2012
@@ -33,8 +33,8 @@ import org.apache.clerezza.rdf.core.UriR
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
-import org.apache.stanbol.enhancer.engines.dbpspotlight.annotate.Annotation;
-import org.apache.stanbol.enhancer.engines.dbpspotlight.annotate.DBPSpotlightAnnotateEnhancementEngine;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.Constants;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.Annotation;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
@@ -48,7 +48,6 @@ import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
-import org.osgi.service.cm.ConfigurationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -66,9 +65,9 @@ public class DBPSpotlightAnnotateEnhance
private static final Logger LOG = LoggerFactory
.getLogger(DBPSpotlightAnnotateEnhancementTest.class);
private static String SPL_URL = System
- .getProperty(DBPSpotlightAnnotateEnhancementEngine.SL_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/annotate"
+ .getProperty(Constants.PARAM_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/annotate"
: (String) System
- .getProperty(DBPSpotlightAnnotateEnhancementEngine.SL_URL_KEY);
+ .getProperty(Constants.PARAM_URL_KEY);
private static String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday";
private static DBPSpotlightAnnotateEnhancementEngine dbpslight;
Copied: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java (from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java?p2=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java&p1=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java&r1=1376420&r2=1376912&rev=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-candidates/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/candidates/DBPSpotlightCandidatesEnhancementTest.java Fri Aug 24 13:48:52 2012
@@ -33,8 +33,8 @@ import org.apache.clerezza.rdf.core.UriR
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
-import org.apache.stanbol.enhancer.engines.dbpspotlight.candidates.DBPSpotlightCandidatesEnhancementEngine;
-import org.apache.stanbol.enhancer.engines.dbpspotlight.candidates.SurfaceForm;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.Constants;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.SurfaceForm;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
@@ -48,7 +48,6 @@ import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
-import org.osgi.service.cm.ConfigurationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -66,9 +65,9 @@ public class DBPSpotlightCandidatesEnhan
private static final Logger LOG = LoggerFactory
.getLogger(DBPSpotlightCandidatesEnhancementTest.class);
private static String SPL_URL = System
- .getProperty(DBPSpotlightCandidatesEnhancementEngine.SL_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/candidates"
+ .getProperty(Constants.PARAM_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/candidates"
: (String) System
- .getProperty(DBPSpotlightCandidatesEnhancementEngine.SL_URL_KEY);
+ .getProperty(Constants.PARAM_URL_KEY);
private static String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday.";
private static DBPSpotlightCandidatesEnhancementEngine dbpslight;
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementTest.java?rev=1376912&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementTest.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementTest.java Fri Aug 24 13:48:52 2012
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate;
+
+import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.ENHANCE_ASYNC;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map.Entry;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.Constants;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.Annotation;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.spot.DBPSpotlightSpotEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class provides a JUnit test for the DBpedia Spotlight Disambiguate
+ * EnhancementEngine.
+ *
+ * @author Iavor Jelev, babelmonkeys / GzEvD
+ */
+public class DBPSpotlightDisambiguateEnhancementTest {
+
+ /**
+ * This contains the logger.
+ */
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DBPSpotlightDisambiguateEnhancementTest.class);
+ private static String SPL_URL = System
+ .getProperty(Constants.PARAM_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/annotate"
+ : (String) System
+ .getProperty(Constants.PARAM_URL_KEY);
+ private static String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday.";
+ private static DBPSpotlightDisambiguateEnhancementEngine dbpslight;
+ private static String testFile = "spots.xml";
+ private static String spotsXml;
+
+ private static ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
+
+ private ContentItem ci;
+ private static Entry<UriRef, Blob> textContentPart;
+
+ @BeforeClass
+ public static void oneTimeSetup() throws Exception {
+ dbpslight = new DBPSpotlightDisambiguateEnhancementEngine(new URL(SPL_URL));
+ }
+
+ @Before
+ public void initTest() throws IOException {
+ //create the contentItem for testing
+ ci = ciFactory.createContentItem(new StringSource(TEST_TEXT));
+ assertNotNull(ci);
+ textContentPart = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
+ assertNotNull(textContentPart);
+ //add the language of the text
+ ci.getMetadata().add(new TripleImpl(ci.getUri(), Properties.DC_LANGUAGE,
+ new PlainLiteralImpl("en")));
+ assertEquals("en", EnhancementEngineHelper.getLanguage(ci));
+
+ LiteralFactory lf = LiteralFactory.getInstance();
+
+ //we need also to create a fise:TextAnnotation to test disambiguation
+ String selected = "Angela Merkel";
+ Language en = new Language("en");
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci,
+ new DBPSpotlightSpotEnhancementEngine());
+ MGraph model = ci.getMetadata();
+ model.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT,
+ new PlainLiteralImpl(selected,en)));
+ model.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT,
+ new PlainLiteralImpl(TEST_TEXT,en)));
+ model.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START,
+ lf.createTypedLiteral(TEST_TEXT.indexOf(selected))));
+ model.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END,
+ lf.createTypedLiteral(TEST_TEXT.indexOf(selected)+selected.length())));
+ model.add(new TripleImpl(textAnnotation, Properties.DC_TYPE,
+ OntologicalClasses.DBPEDIA_PERSON));
+ //validate that the created TextAnnotation is valid (test the test ...)
+ EnhancementStructureHelper.validateAllTextAnnotations(model, TEST_TEXT, null);
+ }
+
+ @Test
+ public void testEntityExtraction() {
+ Collection<Annotation> entities;
+ try {
+ spotsXml = IOUtils.toString(this.getClass().getClassLoader()
+ .getResourceAsStream(testFile));
+ System.out.println(SPL_URL);
+ entities = dbpslight.doPostRequest(TEST_TEXT, spotsXml,ci.getUri());
+ LOG.info("Found entities: {}", entities.size());
+ LOG.debug("Entities:\n{}", entities);
+ Assert.assertFalse("No entities were found!", entities.isEmpty());
+ } catch (Exception e) {
+ Assert.assertFalse("An EngineException occurred! The message was: "
+ + e.getMessage(), true);
+ }
+ }
+ @Test
+ public void testCanEnhance() throws EngineException {
+ assertEquals(ENHANCE_ASYNC, dbpslight.canEnhance(ci));
+ }
+
+ /**
+ * Validates the Enhancements created by this engine
+ * @throws EngineException
+ */
+ @Test
+ public void testEnhancement() throws EngineException {
+ dbpslight.computeEnhancements(ci);
+ HashMap<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
+ expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
+ expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(
+ dbpslight.getClass().getName()));
+ //validate fise:EntityAnnotations
+ EnhancementStructureHelper.validateAllEntityAnnotations(
+ ci.getMetadata(), expectedValues);
+ }
+}
Copied: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java (from r1376420, incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java)
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java?p2=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java&p1=incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java&r1=1376420&r2=1376912&rev=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementTest.java Fri Aug 24 13:48:52 2012
@@ -16,7 +16,6 @@
*/
package org.apache.stanbol.enhancer.engines.dbpspotlight.spot;
-import static org.apache.stanbol.enhancer.engines.dbpspotlight.spot.DBPSpotlightSpotEnhancementEngine.SL_URL_KEY;
import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.ENHANCE_ASYNC;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -35,23 +34,20 @@ import org.apache.clerezza.rdf.core.UriR
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
-import org.apache.stanbol.enhancer.engines.dbpspotlight.spot.DBPSpotlightSpotEnhancementEngine;
-import org.apache.stanbol.enhancer.engines.dbpspotlight.spot.SurfaceForm;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.Constants;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.model.SurfaceForm;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
-import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper;
-import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
-import org.osgi.service.cm.ConfigurationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -69,9 +65,9 @@ public class DBPSpotlightSpotEnhancement
private static final Logger LOG = LoggerFactory
.getLogger(DBPSpotlightSpotEnhancementTest.class);
private static String SPL_URL = System
- .getProperty(SL_URL_KEY) == null ?
+ .getProperty(Constants.PARAM_URL_KEY) == null ?
"http://spotlight.dbpedia.org/rest/spot" :
- (String) System.getProperty(SL_URL_KEY);
+ (String) System.getProperty(Constants.PARAM_URL_KEY);
private static String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday";
private static DBPSpotlightSpotEnhancementEngine dbpslight;
Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml?rev=1376912&r1=1376911&r2=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml Fri Aug 24 13:48:52 2012
@@ -57,9 +57,6 @@
<module>opencalais</module> <!-- http://opencalais.com/ -->
<module>zemanta</module> <!-- htt://zemanta.com -->
<!-- DBpedia.org Spotlight Enhancement Engines (STANBOL-706) -->
- <module>dbpedia-spotlight-annotate</module>
- <module>dbpedia-spotlight-candidates</module>
- <module>dbpedia-spotlight-disambiguate</module>
- <module>dbpedia-spotlight-spot</module>
+ <module>dbpedia-spotlight</module>
</modules>
</project>
Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/generic/test/src/main/java/org/apache/stanbol/enhancer/test/helper/EnhancementStructureHelper.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/generic/test/src/main/java/org/apache/stanbol/enhancer/test/helper/EnhancementStructureHelper.java?rev=1376912&r1=1376911&r2=1376912&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/generic/test/src/main/java/org/apache/stanbol/enhancer/test/helper/EnhancementStructureHelper.java (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/generic/test/src/main/java/org/apache/stanbol/enhancer/test/helper/EnhancementStructureHelper.java Fri Aug 24 13:48:52 2012
@@ -420,7 +420,8 @@ public class EnhancementStructureHelper
XSD.dateTime.equals(((TypedLiteral)createdResource).getDataType()));
Date creationDate = LiteralFactory.getInstance().createObject(Date.class, (TypedLiteral)createdResource);
assertNotNull("Unable to convert "+createdResource+" to a Java Date object",creationDate);
- assertTrue("CreationDate MUST NOT be in the Future",new Date().after(creationDate));
+ Date now = new Date();
+ assertTrue("CreationDate MUST NOT be in the Future",now.after(creationDate) || now.equals(creationDate));
assertFalse("Only a single createnDate MUST BE present", createdIterator.hasNext());
//validate optional modification date if present
Iterator<Triple> modDateIterator = enhancements.filter(enhancement, DCTERMS.modified, null);