You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/08/20 19:14:58 UTC
svn commit: r1375110 [2/2] - in
/incubator/stanbol/branches/dbpedia-spotlight-engines/engines: ./
dbpedia-spotlight-annotate/
dbpedia-spotlight-annotate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/
dbpedia-spotlight-annotate/src/main...
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java?rev=1375110&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java Mon Aug 20 17:14:56 2012
@@ -0,0 +1,497 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NonLiteral;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.clerezza.rdf.core.serializedform.Serializer;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
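+ * {@link DBPSpotlightDisambiguateEnhancementEngine} sends the existing
+ * TextAnnotations of a content item to DBpedia Spotlight for disambiguation
+ * and links each returned DBpedia resource to its TextAnnotation as an
+ * EntityAnnotation.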
+ *
+ * @author Iavor Jelev, Babelmonkeys (GzEvD)
+ */
+@Component(metatype = true, immediate = true, label = "%stanbol.DBPSpotlightDisambiguateEnhancementEngine.name", description = "%stanbol.DBPSpotlightDisambiguateEnhancementEngine.description")
+@Service
+@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightdisambiguate") })
+public class DBPSpotlightDisambiguateEnhancementEngine extends
+ AbstractEnhancementEngine<IOException, RuntimeException> implements
+ EnhancementEngine, ServiceProperties {
+
+ // all parameters which can be used to configure the EnhancementEngine
+ @Property(value = "http://spotlight.dbpedia.org/rest/annotate")
+ public static final String SL_URL_KEY = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.url";
+
+ @Property(value = "Document")
+ public static final String SL_DISAMBIGUATOR = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.disambiguator";
+
+ @Property()
+ public static final String SL_RESTRICTION = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.types";
+
+ @Property()
+ public static final String SL_SPARQL = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.sparql";
+
+ @Property()
+ public static final String SL_SUPPORT = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.support";
+
+ @Property()
+ public static final String SL_CONFIDENCE = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.confidence";
+
+ /**
+ * The default value for the Execution of this Engine. Currently set to
+ * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION} - 31.
+ */
+ public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 31;
+
+ /**
+ * This contains the only MIME type directly supported by this enhancement
+ * engine.
+ */
+ private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /** Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} */
+ private static final Set<String> SUPPORTED_MIMTYPES = Collections
+ .singleton(TEXT_PLAIN_MIMETYPE);
+ /** This contains the logger. */
+ private static final Logger log = LoggerFactory
+ .getLogger(DBPSpotlightDisambiguateEnhancementEngine.class);
+ /** holds the url of the Spotlight REST endpoint */
+ private String spotlightUrl;
+ /** holds the name of the chosen disambiguator to be used */
+ private String spotlightDisambiguator;
+ /** holds the type restriction for the results, if the user wishes one */
+ private String spotlightTypesRestriction;
+ /** holds the chosen minimal support value */
+ private String spotlightSupport;
+ /** holds the chosen minimal confidence value */
+ private String spotlightConfidence;
+ /** holds the sparql restriction for the results, if the user wishes one */
+ private String spotlightSparql;
+ /**
+ * holds the existing TextAnnotations, keyed by their selected text; they
+ * are used as input for DBpedia Spotlight, and later for linking of the
+ * results
+ */
+ private Hashtable<String, UriRef> textAnnotationsMap;
+
+    /**
+     * Reads the engine configuration from the component properties. A missing
+     * endpoint URL falls back to
+     * <code>http://spotlight.dbpedia.org/rest/annotate</code>, missing support
+     * and confidence values fall back to <code>"-1"</code>, and every other
+     * missing option stays <code>null</code>.
+     *
+     * @param ce
+     *            the {@link ComponentContext}
+     */
+    @SuppressWarnings("unchecked")
+    protected void activate(ComponentContext ce) throws ConfigurationException,
+            IOException {
+
+        super.activate(ce);
+
+        Dictionary<String, Object> config = ce.getProperties();
+
+        Object url = config.get(SL_URL_KEY);
+        if (url == null) {
+            spotlightUrl = "http://spotlight.dbpedia.org/rest/annotate";
+        } else {
+            spotlightUrl = (String) url;
+        }
+
+        // Casting a null reference yields null, so the optional settings need
+        // no explicit null branch.
+        spotlightDisambiguator = (String) config.get(SL_DISAMBIGUATOR);
+        spotlightTypesRestriction = (String) config.get(SL_RESTRICTION);
+        spotlightSparql = (String) config.get(SL_SPARQL);
+
+        Object support = config.get(SL_SUPPORT);
+        if (support == null) {
+            spotlightSupport = "-1";
+        } else {
+            spotlightSupport = (String) support;
+        }
+
+        Object confidence = config.get(SL_CONFIDENCE);
+        if (confidence == null) {
+            spotlightConfidence = "-1";
+        } else {
+            spotlightConfidence = (String) confidence;
+        }
+    }
+
+ /**
+ * Check if the content can be enhanced
+ *
+ * @param ci
+ * the {@link ContentItem}
+ */
+ public int canEnhance(ContentItem ci) throws EngineException {
+ if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) {
+ return ENHANCE_SYNCHRONOUS;
+ } else {
+ return CANNOT_ENHANCE;
+ }
+ }
+
+    /**
+     * Calculate the enhancements by doing a POST request to the DBpedia
+     * Spotlight endpoint and processing the results.
+     * <p>
+     * The existing TextAnnotations are collected under the content item's read
+     * lock, the remote call is made without holding any lock, and the results
+     * are written back under the write lock.
+     *
+     * @param ci
+     *            the {@link ContentItem}
+     * @throws EngineException
+     *             if the text cannot be read or the Spotlight request fails
+     */
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci,
+                SUPPORTED_MIMTYPES);
+        if (contentPart == null) {
+            throw new IllegalStateException(
+                    "No ContentPart with Mimetype '"
+                            + TEXT_PLAIN_MIMETYPE
+                            + "' found for ContentItem "
+                            + ci.getUri()
+                            + ": This is also checked in the canEnhance method! -> This "
+                            + "indicated an Bug in the implementation of the "
+                            + "EnhancementJobManager!");
+        }
+        String text;
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
+        }
+
+        // Retrieve the existing text annotations. The metadata graph is
+        // iterated here, so the read lock must actually be held.
+        MGraph graph = ci.getMetadata();
+        String xmlTextAnnotations;
+        ci.getLock().readLock().lock();
+        try {
+            xmlTextAnnotations = this.getSpottedXml(text, graph);
+        } finally {
+            ci.getLock().readLock().unlock();
+        }
+
+        Collection<Annotation> dbpslGraph = doPostRequest(text,
+                xmlTextAnnotations);
+        if (dbpslGraph == null) {
+            return;
+        }
+
+        // Acquire a write lock on the ContentItem when adding the
+        // enhancements
+        ci.getLock().writeLock().lock();
+        try {
+            createEnhancements(dbpslGraph, ci);
+            if (log.isDebugEnabled()) {
+                Serializer serializer = Serializer.getInstance();
+                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
+                serializer.serialize(debugStream, ci.getMetadata(),
+                        "application/rdf+xml");
+                try {
+                    log.debug("DBpedia Enhancements:\n{}",
+                            debugStream.toString("UTF-8"));
+                } catch (UnsupportedEncodingException e) {
+                    // Only affects the debug output, so report it through the
+                    // logger and carry on.
+                    log.warn("Unable to render the enhancements for debug output", e);
+                }
+            }
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
+    }
+
+    /**
+     * The method adds the returned DBpedia Spotlight annotations to the content
+     * item's metadata. For each DBpedia resource whose surface form matches a
+     * known TextAnnotation, an EntityAnnotation is created and linked to that
+     * TextAnnotation. Annotations without a matching TextAnnotation are
+     * skipped.
+     *
+     * @param occs
+     *            a Collection of entity information
+     * @param ci
+     *            the content item
+     */
+    public void createEnhancements(Collection<Annotation> occs,
+            ContentItem ci) {
+        Hashtable<String, UriRef> knownTextAnnotations = textAnnotationsMap;
+        if (occs == null || knownTextAnnotations == null) {
+            // Nothing to link: either no results, or the TextAnnotations were
+            // never collected for this engine instance.
+            return;
+        }
+
+        MGraph model = ci.getMetadata();
+
+        // Language used for plain literals representing parts of the content.
+        String langString = getMetadataLanguage(model, null);
+        final Language language;
+        if (langString != null && !langString.isEmpty()) {
+            language = new Language(langString);
+        } else {
+            language = null;
+        }
+
+        for (Annotation occ : occs) {
+            if (occ.surfaceForm == null) {
+                // Hashtable rejects null keys.
+                continue;
+            }
+            UriRef textAnnotation = knownTextAnnotations.get(occ.surfaceForm);
+            if (textAnnotation == null) {
+                continue;
+            }
+
+            UriRef entityAnnotation = EnhancementEngineHelper
+                    .createEntityEnhancement(ci, this);
+            Literal label = new PlainLiteralImpl(occ.surfaceForm, language);
+            model.add(new TripleImpl(entityAnnotation, DC_RELATION,
+                    textAnnotation));
+            model.add(new TripleImpl(entityAnnotation,
+                    ENHANCER_ENTITY_LABEL, label));
+
+            HashSet<String> typeNames = occ.getTypeNames();
+            if (typeNames != null) {
+                for (String typeName : typeNames) {
+                    model.add(new TripleImpl(entityAnnotation,
+                            ENHANCER_ENTITY_TYPE, new UriRef(typeName)));
+                }
+            }
+            model.add(new TripleImpl(entityAnnotation,
+                    ENHANCER_ENTITY_REFERENCE, occ.uri));
+        }
+    }
+
+ /**
+ * Sends a POST request to the DBpediaSpotlight url.
+ *
+ * @param text
+ * a <code>String</code> with the text to be analyzed
+ * @param xmlTextAnnotations
+ * @param textAnnotations
+ * @return a <code>String</code> with the server response
+ * @throws EngineException
+ * if the request cannot be sent
+ */
+ public Collection<Annotation> doPostRequest(String text,
+ String xmlTextAnnotations) throws EngineException {
+ StringBuilder data = new StringBuilder();
+
+ try {
+ data.append(URLEncoder.encode("spotter=SpotXmlParser", "UTF-8")
+ + "&");
+ if (spotlightDisambiguator != null
+ && !spotlightDisambiguator.isEmpty())
+ data.append(URLEncoder.encode("disambiguator", "UTF-8") + "="
+ + URLEncoder.encode(spotlightDisambiguator, "UTF-8")
+ + "&");
+ if (spotlightTypesRestriction != null
+ && !spotlightTypesRestriction.isEmpty())
+ data.append(URLEncoder.encode("types", "UTF-8") + "="
+ + URLEncoder.encode(spotlightTypesRestriction, "UTF-8")
+ + "&");
+ if (spotlightSupport != null && !spotlightSupport.isEmpty())
+ data.append(URLEncoder.encode("support", "UTF-8") + "="
+ + URLEncoder.encode(spotlightSupport, "UTF-8") + "&");
+ if (spotlightConfidence != null && !spotlightConfidence.isEmpty())
+ data.append(URLEncoder.encode("confidence", "UTF-8") + "="
+ + URLEncoder.encode(spotlightConfidence, "UTF-8") + "&");
+ if (spotlightSparql != null && !spotlightSparql.isEmpty()
+ && spotlightTypesRestriction == null)
+ data.append(URLEncoder.encode("sparql", "UTF-8") + "="
+ + URLEncoder.encode(spotlightSparql, "UTF-8") + "&");
+ data.append(URLEncoder.encode("text", "UTF-8") + "="
+ + URLEncoder.encode(xmlTextAnnotations, "UTF-8"));
+ } catch (UnsupportedEncodingException e) {
+ throw new EngineException(
+ "Data for the httprequest could not be converted. Error: "
+ + e.getMessage());
+ }
+
+ HttpURLConnection connection = null;
+ StringBuffer response = new StringBuffer();
+
+ try {
+ // Create connection
+ URL url = new URL(spotlightUrl);
+ connection = (HttpURLConnection) url.openConnection();
+ connection.setRequestMethod("POST");
+ connection.setRequestProperty("Content-Type",
+ "application/x-www-form-urlencoded");
+ connection.setRequestProperty("Accept", "text/xml");
+
+ connection.setUseCaches(false);
+ connection.setDoInput(true);
+ connection.setDoOutput(true);
+
+ // Send request
+ DataOutputStream wr = new DataOutputStream(
+ connection.getOutputStream());
+ wr.writeBytes(data.toString());
+ wr.flush();
+ wr.close();
+
+ // Get Response
+ InputStream is = connection.getInputStream();
+ BufferedReader rd = new BufferedReader(new InputStreamReader(is));
+ String line;
+ while ((line = rd.readLine()) != null) {
+ response.append(line);
+ response.append('\r');
+ }
+ rd.close();
+
+ } catch (Exception e) {
+ log.error("[request - error] The following error occurred: "
+ + e.getMessage());
+
+ } finally {
+
+ if (connection != null) {
+ connection.disconnect();
+ }
+ }
+
+ XMLParser xmlParser = new XMLParser();
+ try {
+ Document xmlDoc = xmlParser.loadXMLFromString(response.toString());
+ NodeList nlist = xmlParser.getElementsByTagName(xmlDoc, "Resource");
+ Collection<Annotation> annos = this.getAnnotations(nlist);
+
+ return annos;
+ } catch (Exception e) {
+ throw new EngineException(
+ "Response XML could not be parsed. Error: "
+ + e.getMessage());
+ }
+ }
+
+    /**
+     * Builds the XML document that tells DBpedia Spotlight which surface forms
+     * have already been spotted, and (re)initialises
+     * {@link #textAnnotationsMap} with the selected text of every
+     * TextAnnotation found in the graph.
+     * <p>
+     * All attribute values are XML-escaped, so content containing quotes,
+     * angle brackets or ampersands still yields well-formed XML.
+     *
+     * @param text
+     *            the plain text of the content item
+     * @param graph
+     *            the metadata graph holding the existing TextAnnotations
+     * @return the <code>annotation</code> XML document as a String
+     */
+    private String getSpottedXml(String text, MGraph graph) {
+        StringBuilder xml = new StringBuilder();
+        textAnnotationsMap = new Hashtable<String, UriRef>();
+
+        xml.append("<annotation text=\"").append(escapeXmlAttribute(text))
+                .append("\">");
+        try {
+            for (Iterator<Triple> it = graph.filter(null, RDF_TYPE,
+                    TechnicalClasses.ENHANCER_TEXTANNOTATION); it.hasNext();) {
+                NonLiteral subject = it.next().getSubject();
+                if (!(subject instanceof UriRef)) {
+                    // Skip this one instead of letting a ClassCastException
+                    // abort the whole loop.
+                    continue;
+                }
+                UriRef uri = (UriRef) subject;
+                String surfaceForm = EnhancementEngineHelper.getString(graph,
+                        uri, ENHANCER_SELECTED_TEXT);
+                if (surfaceForm == null) {
+                    continue;
+                }
+                String offset = EnhancementEngineHelper.getString(graph,
+                        uri, ENHANCER_START);
+                textAnnotationsMap.put(surfaceForm, uri);
+                xml.append("<surfaceForm name=\"")
+                        .append(escapeXmlAttribute(surfaceForm))
+                        .append("\" offset=\"")
+                        .append(escapeXmlAttribute(offset)).append("\"/>");
+            }
+        } catch (Exception e) {
+            log.error("Unable to collect the existing TextAnnotations", e);
+        }
+
+        return xml.append("</annotation>").toString();
+    }
+
+    /**
+     * Escapes a value for use inside a double-quoted XML attribute. Line breaks
+     * and tabs are written as character references so that attribute-value
+     * normalisation does not alter them. A <code>null</code> value is rendered
+     * as the text <code>null</code>, matching what <code>String.format</code>
+     * produced here before.
+     */
+    private static String escapeXmlAttribute(String value) {
+        String raw = String.valueOf(value);
+        StringBuilder escaped = new StringBuilder(raw.length() + 16);
+        for (int i = 0; i < raw.length(); i++) {
+            char c = raw.charAt(i);
+            switch (c) {
+            case '&':
+                escaped.append("&amp;");
+                break;
+            case '<':
+                escaped.append("&lt;");
+                break;
+            case '>':
+                escaped.append("&gt;");
+                break;
+            case '"':
+                escaped.append("&quot;");
+                break;
+            case '\'':
+                escaped.append("&apos;");
+                break;
+            case '\n':
+                escaped.append("&#10;");
+                break;
+            case '\r':
+                escaped.append("&#13;");
+                break;
+            case '\t':
+                escaped.append("&#9;");
+                break;
+            default:
+                escaped.append(c);
+            }
+        }
+        return escaped.toString();
+    }
+
+    /**
+     * Converts the <code>Resource</code> elements of a DBpedia Spotlight
+     * response into {@link Annotation} objects, which
+     * <code>createEnhancements</code> later adds to the meta data of the
+     * content item.
+     *
+     * @param nList
+     *            NodeList of all Resources contained in the XML response from
+     *            DBpedia Spotlight
+     * @return a Collection with one Annotation per Resource element
+     */
+    private Collection<Annotation> getAnnotations(NodeList nList) {
+        Collection<Annotation> result = new HashSet<Annotation>();
+
+        int count = nList.getLength();
+        for (int i = 0; i < count; i++) {
+            Element resource = (Element) nList.item(i);
+
+            Annotation annotation = new Annotation();
+            annotation.uri = new UriRef(resource.getAttribute("URI"));
+            annotation.support = Integer.parseInt(resource
+                    .getAttribute("support"));
+            annotation.types = resource.getAttribute("types");
+            annotation.surfaceForm = resource.getAttribute("surfaceForm");
+            annotation.offset = Integer.parseInt(resource
+                    .getAttribute("offset"));
+            annotation.similarityScore = Double.parseDouble(resource
+                    .getAttribute("similarityScore"));
+            annotation.percentageOfSecondRank = Double.parseDouble(resource
+                    .getAttribute("percentageOfSecondRank"));
+
+            result.add(annotation);
+        }
+
+        return result;
+    }
+
+ public Map<String, Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(
+ ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+ }
+
+    /**
+     * Looks up the first <code>dc:language</code> value for the given subject.
+     *
+     * @param model
+     *            the graph to search
+     * @param subj
+     *            the subject to match, passed to the graph filter as is
+     * @return the lexical form of the first language value found, or
+     *         <code>null</code> if there is none
+     */
+    public String getMetadataLanguage(MGraph model, NonLiteral subj) {
+        Iterator<Triple> languages = model.filter(subj, DC_LANGUAGE, null);
+        if (!languages.hasNext()) {
+            return null;
+        }
+        return getLexicalForm(languages.next().getObject());
+    }
+
+ public String getLexicalForm(Resource res) {
+ if (res == null) {
+ return null;
+ } else if (res instanceof Literal) {
+ return ((Literal) res).getLexicalForm();
+ } else {
+ return res.toString();
+ }
+ }
+
+    /**
+     * Overrides the Spotlight endpoint url. Used by the test class, which
+     * creates the engine without activating it.
+     *
+     * @param url
+     *            String the url of the Spotlight endpoint
+     */
+    public void setEndpointUrl(String url) {
+        this.spotlightUrl = url;
+    }
+
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java?rev=1375110&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java Mon Aug 20 17:14:56 2012
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+
+import javax.xml.XMLConstants;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Parses the XML results given by DBPedia Spotlight.
+ * <p>
+ * Every parser created here has secure processing switched on and, where the
+ * underlying implementation supports it, external entities switched off,
+ * because the documents come from a remote service.
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+
+public class XMLParser {
+
+    /**
+     * Returns all elements of the document with the given tag name.
+     *
+     * @param doc
+     *            the parsed document
+     * @param tagName
+     *            the tag name to look for
+     * @return the matching elements in document order
+     */
+    public NodeList getElementsByTagName(Document doc, String tagName) {
+        return doc.getElementsByTagName(tagName);
+    }
+
+    /**
+     * Parses XML held in a String. The characters are handed to the parser
+     * directly, so neither the platform charset nor an encoding declaration
+     * inside the XML can corrupt non-ASCII content.
+     *
+     * @param xml
+     *            the XML text
+     * @return the normalized document
+     * @throws SAXException
+     *             if the XML is not well-formed or no parser can be created
+     * @throws IOException
+     *             if reading fails
+     */
+    public Document loadXMLFromString(String xml) throws SAXException,
+            IOException {
+        Document doc;
+        try {
+            doc = newDocumentBuilder(true).parse(
+                    new InputSource(new StringReader(xml)));
+        } catch (ParserConfigurationException ex) {
+            throw new SAXException("Unable to create an XML parser", ex);
+        }
+        doc.getDocumentElement().normalize();
+
+        return doc;
+    }
+
+    /**
+     * Parses XML from a stream and closes the stream afterwards, whether or
+     * not parsing succeeded.
+     *
+     * @param is
+     *            the stream to read the XML from
+     * @return the normalized document
+     * @throws SAXException
+     *             if the XML is not well-formed or no parser can be created
+     * @throws IOException
+     *             if reading fails
+     */
+    public Document loadXMLFromInputStream(InputStream is) throws SAXException,
+            IOException {
+        try {
+            Document doc = newDocumentBuilder(true).parse(is);
+            doc.getDocumentElement().normalize();
+            return doc;
+        } catch (ParserConfigurationException ex) {
+            // Report the real cause instead of failing later on a null
+            // builder.
+            throw new SAXException("Unable to create an XML parser", ex);
+        } finally {
+            if (is != null) {
+                is.close();
+            }
+        }
+    }
+
+    /**
+     * Parses the XML file at the given path. Unlike the other loaders, this
+     * one uses a parser that is not namespace aware.
+     *
+     * @param filePath
+     *            path of the XML file
+     * @return the normalized document
+     * @throws ParserConfigurationException
+     *             if no parser can be created
+     * @throws SAXException
+     *             if the XML is not well-formed
+     * @throws IOException
+     *             if the file cannot be read
+     */
+    public Document loadXMLFromFile(String filePath)
+            throws ParserConfigurationException, SAXException, IOException {
+        File xmlFile = new File(filePath);
+        Document doc = newDocumentBuilder(false).parse(xmlFile);
+        doc.getDocumentElement().normalize();
+
+        return doc;
+    }
+
+    /** Creates a hardened {@link DocumentBuilder}. */
+    private static DocumentBuilder newDocumentBuilder(boolean namespaceAware)
+            throws ParserConfigurationException {
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        factory.setNamespaceAware(namespaceAware);
+        factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+        disableFeature(factory,
+                "http://xml.org/sax/features/external-general-entities");
+        disableFeature(factory,
+                "http://xml.org/sax/features/external-parameter-entities");
+        return factory.newDocumentBuilder();
+    }
+
+    /** Switches a parser feature off if the implementation knows it. */
+    private static void disableFeature(DocumentBuilderFactory factory,
+            String feature) {
+        try {
+            factory.setFeature(feature, false);
+        } catch (ParserConfigurationException ignored) {
+            // Best effort: not every parser implementation knows this
+            // feature, and secure processing is enabled regardless.
+        }
+    }
+}
\ No newline at end of file
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/core/DBPSpotlightDisambiguateEnhancementTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/core/DBPSpotlightDisambiguateEnhancementTest.java?rev=1375110&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/core/DBPSpotlightDisambiguateEnhancementTest.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/core/DBPSpotlightDisambiguateEnhancementTest.java Mon Aug 20 17:14:56 2012
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate.core;
+
+import java.util.Collection;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate.Annotation;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate.DBPSpotlightDisambiguateEnhancementEngine;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class provides a JUnit test for the DBpedia Spotlight Disambiguate
+ * EnhancementEngine. It sends the spots stored in <code>spots.xml</code> to
+ * the configured Spotlight endpoint and therefore needs network access.
+ *
+ * @author Iavor Jelev, babelmonkeys / GzEvD
+ */
+public class DBPSpotlightDisambiguateEnhancementTest {
+
+    /**
+     * This contains the logger.
+     */
+    private static final Logger LOG = LoggerFactory
+            .getLogger(DBPSpotlightDisambiguateEnhancementTest.class);
+    /** Endpoint under test; can be overridden with a system property. */
+    private static final String SPL_URL = System.getProperty(
+            DBPSpotlightDisambiguateEnhancementEngine.SL_URL_KEY,
+            "http://spotlight.dbpedia.org/rest/annotate");
+    private static final String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday.";
+    /** Classpath resource holding the spotted surface forms. */
+    private static final String TEST_FILE = "spots.xml";
+    private static DBPSpotlightDisambiguateEnhancementEngine dbpslight;
+
+    @BeforeClass
+    public static void oneTimeSetup() throws ConfigurationException {
+        dbpslight = new DBPSpotlightDisambiguateEnhancementEngine();
+        dbpslight.setEndpointUrl(SPL_URL);
+    }
+
+    /**
+     * Any exception is left to JUnit, so a failure is reported with its full
+     * stack trace instead of just the message.
+     */
+    @Test
+    public void testEntityExtraction() throws Exception {
+        java.io.InputStream in = this.getClass().getClassLoader()
+                .getResourceAsStream(TEST_FILE);
+        Assert.assertNotNull("Test resource '" + TEST_FILE
+                + "' was not found on the classpath", in);
+        String spotsXml;
+        try {
+            spotsXml = IOUtils.toString(in);
+        } finally {
+            in.close();
+        }
+
+        LOG.info("Using Spotlight endpoint: {}", SPL_URL);
+        Collection<Annotation> entities = dbpslight.doPostRequest(TEST_TEXT,
+                spotsXml);
+        LOG.info("Found entities: {}", entities.size());
+        LOG.debug("Entities:\n{}", entities);
+        Assert.assertFalse("No entities were found!", entities.isEmpty());
+    }
+}
Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/pom.xml?rev=1375110&r1=1375107&r2=1375110&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/pom.xml (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/pom.xml Mon Aug 20 17:14:56 2012
@@ -22,7 +22,7 @@
</parent>
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlightspot</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlight.spot</artifactId>
<packaging>bundle</packaging>
<name>Apache Stanbol Enhancer Enhancement Engine : DBPedia Spotlight Spot</name>
@@ -43,7 +43,7 @@
<configuration>
<instructions>
<Export-Package>
- org.apache.stanbol.enhancer.engines.dbpspotlightspot;version=${project.version}
+ org.apache.stanbol.enhancer.engines.dbpspotlight.spot;version=${project.version}
</Export-Package>
<Embed-Dependency>
</Embed-Dependency>
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java?rev=1375110&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java Mon Aug 20 17:14:56 2012
@@ -0,0 +1,429 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.spot;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NonLiteral;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.clerezza.rdf.core.serializedform.Serializer;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * {@link DBPSpotlightSpotEnhancementEngine} adds a TextAnnotation for every
+ * surface form that DBpedia Spotlight spots in the text of a content item.
+ *
+ * @author Iavor Jelev, Babelmonkeys (GzEvD)
+ */
+@Component(metatype = true, immediate = true, label = "%stanbol.DBPSpotlightSpotEnhancementEngine.name", description = "%stanbol.DBPSpotlightSpotEnhancementEngine.description")
+@Service
+@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightspot") })
+public class DBPSpotlightSpotEnhancementEngine extends
+ AbstractEnhancementEngine<IOException, RuntimeException> implements
+ EnhancementEngine, ServiceProperties {
+
+ /**
+ * Configuration key for the URL of the DBpedia Spotlight "spot" REST
+ * endpoint. {@link #activate(ComponentContext)} falls back to
+ * <code>http://spotlight.dbpedia.org/rest/spot</code> if no value is set.
+ */
+ @Property(value = "http://spotlight.dbpedia.org/rest/spot")
+ public static final String SL_URL_KEY = "stanbol.DBPSpotlightSpotEnhancementEngine.url";
+
+ /**
+ * Configuration key for the name of the spotter. A non-empty value is sent
+ * to the endpoint as the <code>spotter</code> request parameter.
+ */
+ @Property(value = "LingPipeSpotter")
+ public static final String SL_SPOTTER = "stanbol.DBPSpotlightSpotEnhancementEngine.spotter";
+
+ /**
+ * The default value for the Execution of this Engine. Currently set to
+ * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION} - 29
+ */
+ public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 29;
+
+ /**
+ * This contains the only MIME type directly supported by this enhancement
+ * engine.
+ */
+ private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /**
+ * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+ */
+ private static final Set<String> SUPPORTED_MIMTYPES = Collections
+ .singleton(TEXT_PLAIN_MIMETYPE);
+
+ /**
+ * This contains the languages supported by DBpedia Spotlight. A content
+ * item whose metadata declares a <code>dc:language</code> value that is not
+ * in this set is rejected by {@link #canEnhance(ContentItem)}; a content
+ * item without any language value is still processed.
+ */
+ protected static final Set<String> SUPPORTED_LANGUAGES = Collections
+ .unmodifiableSet(new HashSet<String>(Arrays.asList("en")));
+
+ /** holds the logger. */
+ private static final Logger log = LoggerFactory
+ .getLogger(DBPSpotlightSpotEnhancementEngine.class);
+
+ /** holds the url of the Spotlight REST endpoint */
+ private String spotlightUrl;
+ /** holds the name of the spotter to be used; may be null */
+ private String spotlightSpotter;
+
+    /**
+     * Reads the endpoint URL and the spotter name from the component
+     * configuration. The URL falls back to the public Spotlight endpoint when
+     * it is not configured; an unconfigured spotter is kept as
+     * <code>null</code>.
+     *
+     * @param ce
+     *            the {@link ComponentContext}
+     */
+    @SuppressWarnings("unchecked")
+    protected void activate(ComponentContext ce) throws ConfigurationException,
+            IOException {
+
+        super.activate(ce);
+
+        Dictionary<String, Object> config = ce.getProperties();
+
+        Object configuredUrl = config.get(SL_URL_KEY);
+        if (configuredUrl == null) {
+            spotlightUrl = "http://spotlight.dbpedia.org/rest/spot";
+        } else {
+            spotlightUrl = (String) configuredUrl;
+        }
+
+        // casting null yields null, so a missing spotter simply stays unset
+        spotlightSpotter = (String) config.get(SL_SPOTTER);
+    }
+
+ /**
+ * Check if the content can be enhanced
+ *
+ * @param ci
+ * the {@link ContentItem}
+ */
+ public int canEnhance(ContentItem ci) throws EngineException {
+ if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) {
+ String language = getMetadataLanguage(ci.getMetadata(), null);
+ if (language != null && !SUPPORTED_LANGUAGES.contains(language)) {
+ log.info(
+ "DBpedia Spotlight can not process ContentItem {} because "
+ + "language {} is not supported (supported: {})",
+ new Object[] { ci.getUri(), language,
+ SUPPORTED_LANGUAGES });
+ return CANNOT_ENHANCE;
+ }
+ return ENHANCE_SYNCHRONOUS;
+ }
+ return CANNOT_ENHANCE;
+ }
+
+    /**
+     * Computes the enhancements for a content item: the text of its
+     * <code>text/plain</code> part is sent to DBpedia Spotlight (see
+     * {@link #doPostRequest(String)}) and the returned surface forms are
+     * written to the metadata while the content item's write lock is held.
+     * Nothing is added when the request did not yield a result.
+     *
+     * @param ci
+     *            the {@link ContentItem}
+     * @throws EngineException
+     *             if the text cannot be read or the response cannot be parsed
+     * @throws IllegalStateException
+     *             if the content item has no <code>text/plain</code> part
+     */
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci,
+                SUPPORTED_MIMTYPES);
+        if (contentPart == null) {
+            throw new IllegalStateException(
+                    "No ContentPart with Mimetype '"
+                            + TEXT_PLAIN_MIMETYPE
+                            + "' found for ContentItem "
+                            + ci.getUri()
+                            + ": This is also checked in the canEnhance method! -> This "
+                            + "indicated an Bug in the implementation of the "
+                            + "EnhancementJobManager!");
+        }
+        String text;
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
+        }
+
+        Collection<SurfaceForm> dbpslGraph = doPostRequest(text);
+        if (dbpslGraph == null) {
+            // the request failed; doPostRequest has already logged the reason
+            return;
+        }
+
+        // Acquire a write lock on the ContentItem when adding the
+        // enhancements
+        ci.getLock().writeLock().lock();
+        try {
+            createEnhancements(dbpslGraph, ci);
+            if (log.isDebugEnabled()) {
+                Serializer serializer = Serializer.getInstance();
+                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
+                serializer.serialize(debugStream, ci.getMetadata(),
+                        "application/rdf+xml");
+                try {
+                    log.debug("DBpedia Spotlight Spot Enhancements:\n{}",
+                            debugStream.toString("UTF-8"));
+                } catch (UnsupportedEncodingException e) {
+                    // report through the logger instead of stderr
+                    log.debug("Unable to render the enhancements as UTF-8", e);
+                }
+            }
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
+    }
+
+    /**
+     * Writes the surface forms returned by DBpedia Spotlight to the metadata
+     * of the content item. Every surface form becomes a TextAnnotation with
+     * its selected text, start, end and type. The first annotation created
+     * for a given name is linked via <code>dc:relation</code> to every later
+     * annotation with the same name.
+     *
+     * @param occs
+     *            the surface forms to add
+     * @param ci
+     *            the content item
+     */
+    public void createEnhancements(Collection<SurfaceForm> occs,
+            ContentItem ci) {
+        LiteralFactory literals = LiteralFactory.getInstance();
+
+        // language used for plain literals representing parts of the content
+        String langCode = getMetadataLanguage(ci.getMetadata(), null);
+        final Language language = (langCode == null || langCode.isEmpty())
+                ? null
+                : new Language(langCode);
+
+        // first TextAnnotation created for each surface form name
+        Map<String, UriRef> firstAnnotationByName = new HashMap<String, UriRef>();
+
+        for (SurfaceForm occ : occs) {
+            UriRef textAnnotation = EnhancementEngineHelper
+                    .createTextEnhancement(ci, this);
+            MGraph model = ci.getMetadata();
+
+            Triple selectedText = new TripleImpl(textAnnotation,
+                    ENHANCER_SELECTED_TEXT,
+                    new PlainLiteralImpl(occ.name, language));
+            model.add(selectedText);
+
+            Triple start = new TripleImpl(textAnnotation, ENHANCER_START,
+                    literals.createTypedLiteral(occ.offset));
+            model.add(start);
+
+            Triple end = new TripleImpl(textAnnotation, ENHANCER_END,
+                    literals.createTypedLiteral(occ.offset
+                            + occ.name.length()));
+            model.add(end);
+
+            Triple type = new TripleImpl(textAnnotation, DC_TYPE,
+                    new UriRef(occ.type));
+            model.add(type);
+            // TODO: also add ENHANCER_SELECTION_CONTEXT as
+            // new PlainLiteralImpl(occ.context, language)
+
+            if (!firstAnnotationByName.containsKey(occ.name)) {
+                firstAnnotationByName.put(occ.name, textAnnotation);
+            } else {
+                model.add(new TripleImpl(firstAnnotationByName.get(occ.name),
+                        DC_RELATION, textAnnotation));
+            }
+        }
+    }
+
+ /**
+ * Sends a POST request to the DBpediaSpotlight url.
+ *
+ * @param text
+ * a <code>String</code> with the text to be analyzed
+ * @return a <code>String</code> with the server response
+ * @throws EngineException
+ * if the request cannot be sent
+ */
+ public Collection<SurfaceForm> doPostRequest(String text)
+ throws EngineException {
+ StringBuilder data = new StringBuilder();
+ try {
+ if (spotlightSpotter != null && !spotlightSpotter.isEmpty())
+ data.append(URLEncoder.encode("spotter", "UTF-8") + "="
+ + URLEncoder.encode(spotlightSpotter, "UTF-8") + "&");
+ data.append(URLEncoder.encode("text", "UTF-8") + "="
+ + URLEncoder.encode(text, "UTF-8"));
+ } catch (UnsupportedEncodingException e) {
+ throw new EngineException(
+ "Data for the httprequest could not be converted. Error: "
+ + e.getMessage());
+ }
+
+ HttpURLConnection connection = null;
+ StringBuffer response = new StringBuffer();
+
+ try {
+ // Create connection
+ URL url = new URL(spotlightUrl);
+ connection = (HttpURLConnection) url.openConnection();
+ connection.setRequestMethod("POST");
+ connection.setRequestProperty("Content-Type",
+ "application/x-www-form-urlencoded");
+ connection.setRequestProperty("Accept", "text/xml");
+
+ connection.setUseCaches(false);
+ connection.setDoInput(true);
+ connection.setDoOutput(true);
+
+ // Send request
+ DataOutputStream wr = new DataOutputStream(
+ connection.getOutputStream());
+ wr.writeBytes(data.toString());
+ wr.flush();
+ wr.close();
+
+ // Get Response
+ InputStream is = connection.getInputStream();
+ BufferedReader rd = new BufferedReader(new InputStreamReader(is));
+ String line;
+ while ((line = rd.readLine()) != null) {
+ response.append(line);
+ response.append('\r');
+ }
+ rd.close();
+
+ } catch (Exception e) {
+
+ log.error("[request] Request could not be made. Error: "
+ + e.getMessage());
+ e.printStackTrace();
+ return null;
+
+ } finally {
+
+ if (connection != null) {
+ connection.disconnect();
+ }
+ }
+
+ XMLParser xmlParser = new XMLParser();
+ try {
+ Document xmlDoc = xmlParser.loadXMLFromString(response.toString());
+ NodeList nlist = xmlParser.getElementsByTagName(xmlDoc,
+ "surfaceForm");
+ Collection<SurfaceForm> annos = this.getAnnotations(nlist);
+
+ return annos;
+ } catch (Exception e) {
+ log.error("[response] Response XML could not be parsed. Error: "
+ + e.getMessage());
+ throw new EngineException(
+ "Response XML could not be parsed. Error: "
+ + e.getMessage());
+ }
+ }
+
+    /**
+     * Converts the <code>surfaceForm</code> elements of the Spotlight
+     * response into {@link SurfaceForm} objects by reading their
+     * <code>name</code>, <code>offset</code> and <code>type</code> attributes.
+     *
+     * @param nList
+     *            the elements taken from the XML response
+     * @return a Collection&lt;SurfaceForm&gt; with one entry per element
+     */
+    private Collection<SurfaceForm> getAnnotations(NodeList nList) {
+        Collection<SurfaceForm> surfaceForms = new HashSet<SurfaceForm>();
+
+        int index = 0;
+        while (index < nList.getLength()) {
+            Element element = (Element) nList.item(index);
+
+            SurfaceForm surfaceForm = new SurfaceForm();
+            surfaceForm.name = element.getAttribute("name");
+            surfaceForm.offset = Integer.valueOf(element
+                    .getAttribute("offset"));
+            surfaceForm.type = element.getAttribute("type");
+            surfaceForms.add(surfaceForm);
+
+            index++;
+        }
+
+        return surfaceForms;
+    }
+
+ public Map<String, Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(
+ ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+ }
+
+    /**
+     * Looks up the first <code>dc:language</code> value in the graph.
+     *
+     * @param model
+     *            the graph to search
+     * @param subj
+     *            the subject to filter by, passed on to the graph filter
+     * @return the lexical form of the first match, or <code>null</code> if
+     *         there is none
+     */
+    public String getMetadataLanguage(MGraph model, NonLiteral subj) {
+        Iterator<Triple> languageTriples = model.filter(subj, DC_LANGUAGE,
+                null);
+        if (!languageTriples.hasNext()) {
+            return null;
+        }
+        return getLexicalForm(languageTriples.next().getObject());
+    }
+
+    /**
+     * Returns the textual value of a resource.
+     *
+     * @param res
+     *            the resource, may be <code>null</code>
+     * @return the lexical form for a {@link Literal}, the
+     *         <code>toString()</code> value for any other resource, or
+     *         <code>null</code> if the resource is <code>null</code>
+     */
+    public String getLexicalForm(Resource res) {
+        if (res instanceof Literal) {
+            return ((Literal) res).getLexicalForm();
+        }
+        // instanceof is false for null, so null ends up here as well
+        return res == null ? null : res.toString();
+    }
+
+    /**
+     * Overrides the URL of the Spotlight endpoint. The test class uses this
+     * to configure the engine without an OSGi component context.
+     *
+     * @param url
+     *            the url of the Spotlight endpoint
+     */
+    public void setEndpointUrl(String url) {
+        this.spotlightUrl = url;
+    }
+
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java?rev=1375110&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java Mon Aug 20 17:14:56 2012
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.spot;
+
+//import org.apache.clerezza.rdf.core.Resource;
+
+/**
+ * Stores one surface form given by DBPedia Spotlight Spot.
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+public class SurfaceForm {
+
+    /** the spotted text */
+    public String name;
+    /** the type reported for the surface form */
+    public String type;
+    /** the start position of {@link #name} in the analyzed text */
+    public Integer offset;
+
+    /**
+     * @return the three fields as <code>[name=.., offset=.., type=..]</code>
+     */
+    @Override
+    public String toString() {
+        // %d is the integer conversion; %i does not exist in Java and made
+        // every call fail with an UnknownFormatConversionException
+        return String.format("[name=%s, offset=%d, type=%s]", name, offset,
+                type);
+    }
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/XMLParser.java?rev=1375110&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/XMLParser.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/XMLParser.java Mon Aug 20 17:14:56 2012
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.spot;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+/**
+ * Parses the XML results given by DBPedia Spotlight.
+ * <p>
+ * NOTE(review): DTD processing and external entities are left at the parser
+ * defaults. Consider disabling them if the endpoint is not trusted.
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+
+public class XMLParser {
+
+    /**
+     * Returns all elements of the document with the given tag name.
+     *
+     * @param doc
+     *            the document to search
+     * @param tagName
+     *            the tag name to look for
+     * @return the matching elements
+     */
+    public NodeList getElementsByTagName(Document doc, String tagName) {
+
+        return doc.getElementsByTagName(tagName);
+    }
+
+    /**
+     * Parses XML held in a string.
+     *
+     * @param xml
+     *            the XML text
+     * @return the normalized document
+     * @throws SAXException
+     *             if the text is not well-formed XML
+     * @throws IOException
+     *             if reading the text fails
+     */
+    public Document loadXMLFromString(String xml) throws SAXException,
+            IOException {
+        // Parse from a character stream: the text is already decoded, so no
+        // charset (platform default or otherwise) can corrupt it again.
+        Document doc = newDocumentBuilder().parse(
+                new org.xml.sax.InputSource(new java.io.StringReader(xml)));
+        doc.getDocumentElement().normalize();
+
+        return doc;
+    }
+
+    /**
+     * Parses XML from a stream and closes the stream afterwards, whether or
+     * not parsing succeeded.
+     *
+     * @param is
+     *            the stream to read
+     * @return the normalized document
+     * @throws SAXException
+     *             if the content is not well-formed XML
+     * @throws IOException
+     *             if reading the stream fails
+     */
+    public Document loadXMLFromInputStream(InputStream is) throws SAXException,
+            IOException {
+        try {
+            Document doc = newDocumentBuilder().parse(is);
+            doc.getDocumentElement().normalize();
+
+            return doc;
+        } finally {
+            if (is != null) {
+                try {
+                    is.close();
+                } catch (IOException ignored) {
+                    // the parse result (or its exception) is what matters
+                }
+            }
+        }
+    }
+
+    /**
+     * Parses the XML file at the given path.
+     *
+     * @param filePath
+     *            path of the file to parse
+     * @return the normalized document
+     * @throws ParserConfigurationException
+     *             if no parser can be created
+     * @throws SAXException
+     *             if the file is not well-formed XML
+     * @throws IOException
+     *             if the file cannot be read
+     */
+    public Document loadXMLFromFile(String filePath)
+            throws ParserConfigurationException, SAXException, IOException {
+        File fXmlFile = new File(filePath);
+        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+        Document doc = dBuilder.parse(fXmlFile);
+        doc.getDocumentElement().normalize();
+
+        return doc;
+    }
+
+    /**
+     * Creates a namespace-aware builder. A configuration failure is reported
+     * right away instead of surfacing later as a NullPointerException.
+     */
+    private static DocumentBuilder newDocumentBuilder() {
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        factory.setNamespaceAware(true);
+        try {
+            return factory.newDocumentBuilder();
+        } catch (ParserConfigurationException ex) {
+            throw new IllegalStateException(
+                    "Unable to create a namespace-aware DocumentBuilder", ex);
+        }
+    }
+}
\ No newline at end of file
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/core/DBPSpotlightSpotEnhancementTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/core/DBPSpotlightSpotEnhancementTest.java?rev=1375110&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/core/DBPSpotlightSpotEnhancementTest.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/core/DBPSpotlightSpotEnhancementTest.java Mon Aug 20 17:14:56 2012
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlight.spot.core;
+
+import java.util.Collection;
+
+import org.apache.stanbol.enhancer.engines.dbpspotlight.spot.DBPSpotlightSpotEnhancementEngine;
+import org.apache.stanbol.enhancer.engines.dbpspotlight.spot.SurfaceForm;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class provides a JUnit test for DBpedia Spotlight Spot
+ * EnhancementEngine. The test sends a request to the endpoint named by the
+ * system property {@link DBPSpotlightSpotEnhancementEngine#SL_URL_KEY}, or to
+ * the public Spotlight endpoint if that property is not set.
+ *
+ * @author Iavor Jelev, babelmonkeys / GzEvD
+ */
+public class DBPSpotlightSpotEnhancementTest {
+
+    /**
+     * This contains the logger.
+     */
+    private static final Logger LOG = LoggerFactory
+            .getLogger(DBPSpotlightSpotEnhancementTest.class);
+    /** endpoint used by the test */
+    private static final String SPL_URL = System.getProperty(
+            DBPSpotlightSpotEnhancementEngine.SL_URL_KEY,
+            "http://spotlight.dbpedia.org/rest/spot");
+    private static final String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday";
+    private static DBPSpotlightSpotEnhancementEngine dbpslight;
+
+    @BeforeClass
+    public static void oneTimeSetup() throws ConfigurationException {
+        dbpslight = new DBPSpotlightSpotEnhancementEngine();
+        dbpslight.setEndpointUrl(SPL_URL);
+    }
+
+    /**
+     * Spots the surface forms of a short English sentence and expects at
+     * least one result.
+     */
+    @Test
+    public void testEntityExtraction() {
+        try {
+            Collection<SurfaceForm> entities = dbpslight
+                    .doPostRequest(TEST_TEXT);
+            // doPostRequest returns null when the request itself failed;
+            // report that as a test failure rather than a NullPointerException
+            Assert.assertNotNull("The request to " + SPL_URL
+                    + " did not return a result!", entities);
+            LOG.info("Found entities: {}", entities.size());
+            LOG.debug("Entities:\n{}", entities);
+            Assert.assertFalse("No entities were found!", entities.isEmpty());
+        } catch (EngineException e) {
+            Assert.fail("An EngineException occurred! The message was: "
+                    + e.getMessage());
+        }
+    }
+
+}
Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml?rev=1375110&r1=1375109&r2=1375110&view=diff
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml (original)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml Mon Aug 20 17:14:56 2012
@@ -57,9 +57,9 @@
<module>opencalais</module> <!-- http://opencalais.com/ -->
<module>zemanta</module> <!-- htt://zemanta.com -->
<!-- DBpedia.org Spotlight Enhancement Engines (STANBOL-706) -->
- <module>dbpspotlightannotate</module>
- <module>dbpspotlightcandidates</module>
- <module>dbpspotlightdisambiguate</module>
- <module>dbpspotlightspot</module>
+ <module>dbpedia-spotlight-annotate</module>
+ <module>dbpedia-spotlight-candidates</module>
+ <module>dbpedia-spotlight-disambiguate</module>
+ <module>dbpedia-spotlight-spot</module>
</modules>
</project>