You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/08/20 14:11:05 UTC
svn commit: r1374984 [3/3] - in
/incubator/stanbol/branches/dbpedia-spotlight-engines/engines:
dbpspotlightannotate/ dbpspotlightannotate/src/
dbpspotlightannotate/src/license/ dbpspotlightannotate/src/main/
dbpspotlightannotate/src/main/java/ dbpspotl...
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/pom.xml?rev=1374984&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/pom.xml (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/pom.xml Mon Aug 20 12:11:01 2012
@@ -0,0 +1,121 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+ <groupId>org.apache.stanbol</groupId>
+ <version>0.9.0-incubating</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlightspot</artifactId>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancer Enhancement Engine : DBPedia Spotlight Spot</name>
+ <description>Enhancement engine that spots surface forms (entity mentions) in plain text by calling the DBpedia Spotlight spot REST service</description>
+
+ <inceptionYear>2010</inceptionYear>
+
+ <!--scm>
+ <connection>
+ scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/tags/0.9.0-incubating/enhancer/engines/langid/
+ </connection>
+ <developerConnection>
+ scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/tags/0.9.0-incubating/enhancer/engines/langid/
+ </developerConnection>
+ <url>http://incubator.apache.org/stanbol/</url>
+ </scm-->
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Export-Package>
+ org.apache.stanbol.enhancer.engines.dbpspotlightspot;version=${project.version}
+ </Export-Package>
+ <Embed-Dependency>
+ </Embed-Dependency>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <!-- AL20 licensed files: See src/test/resources/README -->
+ <exclude>src/test/resources/en.txt</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.clerezza</groupId>
+ <artifactId>rdf.core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+</project>
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/license/THIRD-PARTY.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/license/THIRD-PARTY.properties?rev=1374984&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/license/THIRD-PARTY.properties (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/license/THIRD-PARTY.properties Mon Aug 20 12:11:01 2012
@@ -0,0 +1,17 @@
+# Generated by org.codehaus.mojo.license.AddThirdPartyMojo
+#-------------------------------------------------------------------------------
+# Already used licenses in project :
+# - Apache License
+# - Common Development and Distribution License (CDDL) v1.0
+# - Common Public License Version 1.0
+# - ICU License
+# - MIT License
+# - The Apache Software License, Version 2.0
+#-------------------------------------------------------------------------------
+# Please fill the missing licenses for dependencies :
+#
+#
+#Wed Feb 15 19:06:13 CET 2012
+javax.servlet--servlet-api--2.4=Common Development And Distribution License (CDDL), Version 1.0
+org.osgi--org.osgi.compendium--4.1.0=The Apache Software License, Version 2.0
+org.osgi--org.osgi.core--4.1.0=The Apache Software License, Version 2.0
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSLSurfaceForm.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSLSurfaceForm.java?rev=1374984&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSLSurfaceForm.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSLSurfaceForm.java Mon Aug 20 12:11:01 2012
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlightspot;
+
+//import org.apache.clerezza.rdf.core.Resource;
+
+/**
+ * Stores one surface form returned by DBPedia Spotlight Spot.
+ *
+ * <p>This is a plain mutable holder; the fields are public and are assigned
+ * directly by the code that reads the Spotlight response.
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+public class DBPSLSurfaceForm {
+
+    /** The text of the surface form. */
+    public String name;
+    /** The type reported for the surface form. */
+    public String type;
+    /** The offset of the surface form. */
+    public Integer offset;
+
+    /**
+     * Renders the three fields for log output.
+     *
+     * @return a string of the form {@code [name=..., offset=..., type=...]};
+     *         unset fields are rendered as {@code null}
+     */
+    @Override
+    public String toString() {
+        // %d replaces the invalid %i conversion, which made every call throw
+        // java.util.UnknownFormatConversionException.
+        return String.format("[name=%s, offset=%d, type=%s]", name, offset, type);
+    }
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSpotlightSpotEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSpotlightSpotEnhancementEngine.java?rev=1374984&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSpotlightSpotEnhancementEngine.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSpotlightSpotEnhancementEngine.java Mon Aug 20 12:11:01 2012
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlightspot;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NonLiteral;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.clerezza.rdf.core.serializedform.Serializer;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
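+ * {@link DBPSpotlightSpotEnhancementEngine} sends the plain text of a content item
+ * to the DBpedia Spotlight spot service and adds a TextAnnotation to the metadata
+ * for every surface form the service returns.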
+ *
+ * @author Iavor Jelev, Babelmonkeys (GzEvD)
+ */
+@Component(
+ metatype = true,
+ immediate = true,
+ label = "%stanbol.DBPSpotlightSpotEnhancementEngine.name",
+ description = "%stanbol.DBPSpotlightSpotEnhancementEngine.description")
+@Service
+@Properties(value={
+ @Property(name=EnhancementEngine.PROPERTY_NAME,value="dbpspotlightspot")
+})
+public class DBPSpotlightSpotEnhancementEngine
+ extends AbstractEnhancementEngine<IOException,RuntimeException>
+ implements EnhancementEngine, ServiceProperties {
+
+ /**
+ * Configuration key for the URL of the DBpedia Spotlight spot REST endpoint.
+ * The declared default value is the public endpoint.
+ */
+ @Property(value = "http://spotlight.dbpedia.org/rest/spot")
+ public static final String SL_URL_KEY = "stanbol.DBPSpotlightSpotEnhancementEngine.url";
+
+ /**
+ * Configuration key for the spotter to be used (declared default: LingPipeSpotter).
+ * A non-empty value is sent to the service as the "spotter" request parameter.
+ */
+ @Property(value = "LingPipeSpotter")
+ public static final String SL_SPOTTER = "stanbol.DBPSpotlightSpotEnhancementEngine.spotter";
+
+
+ /**
+ * The ordering value reported by {@link #getServiceProperties()}: 29 below
+ * ORDERING_CONTENT_EXTRACTION.
+ */
+ public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 29;
+
+ /**
+ * This contains the only MIME type directly supported by this enhancement engine.
+ */
+ private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /**
+ * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+ */
+ private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+
+ /**
+ * The languages accepted by this engine (currently only "en").
+ * {@link #canEnhance(ContentItem)} rejects content whose metadata declares any
+ * other language; content without a declared language is still accepted.
+ */
+ protected static final Set<String> SUPPORTED_LANGUAGES =
+ Collections.unmodifiableSet(new HashSet<String>(
+ Arrays.asList("en")));
+
+ /** holds the logger. */
+ private static final Logger log = LoggerFactory.getLogger(DBPSpotlightSpotEnhancementEngine.class);
+
+ /** holds the url of the Spotlight REST endpoint */
+ private String spotlightUrl;
+ /** holds the name of the spotter to be used; may be null */
+ private String spotlightSpotter;
+
+
+
+    /**
+     * Activates the engine and reads its configuration: the Spotlight endpoint
+     * URL (falling back to the public endpoint when none is configured) and the
+     * optional spotter name.
+     *
+     * @param ce the {@link ComponentContext}
+     */
+    @SuppressWarnings("unchecked")
+    protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
+        super.activate(ce);
+
+        Dictionary<String, Object> config = ce.getProperties();
+
+        Object configuredUrl = config.get(SL_URL_KEY);
+        if (configuredUrl == null) {
+            spotlightUrl = "http://spotlight.dbpedia.org/rest/spot";
+        } else {
+            spotlightUrl = (String) configuredUrl;
+        }
+
+        // casting null yields null, so a missing spotter needs no extra check
+        spotlightSpotter = (String) config.get(SL_SPOTTER);
+    }
+
+
+ /**
+ * Check if the content can be enhanced
+ * @param ci the {@link ContentItem}
+ */
+ public int canEnhance(ContentItem ci) throws EngineException {
+ if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null){
+ String language = getMetadataLanguage(ci.getMetadata(), null);
+ if (language != null && !SUPPORTED_LANGUAGES.contains(language)) {
+ log.info("DBpedia Spotlight can not process ContentItem {} because "
+ + "language {} is not supported (supported: {})",
+ new Object[]{ci.getUri(),language,SUPPORTED_LANGUAGES});
+ return CANNOT_ENHANCE;
+ }
+ return ENHANCE_SYNCHRONOUS;
+ }
+ return CANNOT_ENHANCE;
+ }
+
+
+    /**
+     * Calculates the enhancements by sending the plain text of the content item
+     * to the DBpedia Spotlight endpoint and adding the results to its metadata.
+     *
+     * @param ci the {@link ContentItem}
+     * @throws EngineException if the text cannot be read from the content item
+     *         or the request/response handling in {@link #doPostRequest(String)} fails
+     * @throws IllegalStateException if the content item has no text/plain part
+     */
+    public void computeEnhancements( ContentItem ci ) throws EngineException {
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
+        if (contentPart == null) {
+            throw new IllegalStateException("No ContentPart with Mimetype '"
+                    + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
+                    + ": This is also checked in the canEnhance method! -> This "
+                    + "indicated an Bug in the implementation of the "
+                    + "EnhancementJobManager!");
+        }
+
+        final String text;
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
+        }
+
+        Collection<DBPSLSurfaceForm> surfaceForms = doPostRequest( text );
+        if (surfaceForms == null) {
+            // doPostRequest returns null when the request could not be made
+            return;
+        }
+
+        // Acquire a write lock on the ContentItem when adding the enhancements
+        ci.getLock().writeLock().lock();
+        try {
+            createEnhancements( surfaceForms, ci );
+            if (log.isDebugEnabled()) {
+                Serializer serializer = Serializer.getInstance();
+                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
+                serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
+                try {
+                    log.debug("DBpedia Spotlight Spot Enhancements:\n{}",debugStream.toString("UTF-8"));
+                } catch (UnsupportedEncodingException e) {
+                    // report through the logger instead of printing to stderr
+                    log.warn("Unable to render the enhancements for debug logging", e);
+                }
+            }
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
+    }
+
+
+ /**
+ * The method adds the returned DBpedia Spotlight surface forms to the content item's metadata.
+ * For each one an TextAnnotation is created.
+ *
+ * @param occs a Collection of entity information
+ * @param ci the content item
+ */
+ public void createEnhancements( Collection<DBPSLSurfaceForm> occs, ContentItem ci ) {
+ LiteralFactory literalFactory = LiteralFactory.getInstance();
+ final Language language; // used for plain literals representing parts fo the content
+ String langString = getMetadataLanguage(ci.getMetadata(), null);
+
+ if(langString != null && !langString.isEmpty()){
+ language = new Language(langString);
+ } else {
+ language = null;
+ }
+
+ HashMap<String, UriRef> entityAnnotationMap = new HashMap<String, UriRef>();
+
+ for (DBPSLSurfaceForm occ : occs) {
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement( ci, this );
+ MGraph model = ci.getMetadata();
+
+ model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,new PlainLiteralImpl(occ.name,language)));
+ model.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(occ.offset)));
+ model.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(occ.offset + occ.name.length())));
+ model.add(new TripleImpl(textAnnotation, DC_TYPE, new UriRef( occ.type )));
+ // TODO ################## model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occ.context,language)));
+
+ if (entityAnnotationMap.containsKey(occ.name)) {
+ model.add(new TripleImpl(entityAnnotationMap.get(occ.name), DC_RELATION, textAnnotation));
+ }
+ else {
+ entityAnnotationMap.put(occ.name,textAnnotation);
+ }
+ }
+ }
+
+
+
+
+ /**
+ * Sends a POST request to the DBpediaSpotlight url.
+ * @param text a <code>String</code> with the text to be analyzed
+ * @return a <code>String</code> with the server response
+ * @throws EngineException if the request cannot be sent
+ */
+ public Collection<DBPSLSurfaceForm> doPostRequest( String text ) throws EngineException {
+ StringBuilder data = new StringBuilder();
+ try {
+ if ( spotlightSpotter != null && !spotlightSpotter.isEmpty() )
+ data.append( URLEncoder.encode( "spotter", "UTF-8" ) + "=" + URLEncoder.encode( spotlightSpotter, "UTF-8" ) + "&" );
+ data.append( URLEncoder.encode( "text", "UTF-8" ) + "=" + URLEncoder.encode( text, "UTF-8" ) );
+ } catch (UnsupportedEncodingException e) {
+ throw new EngineException( "Data for the httprequest could not be converted. Error: " + e.getMessage() );
+ }
+
+ HttpURLConnection connection = null;
+ StringBuffer response = new StringBuffer();
+
+ try {
+ //Create connection
+ URL url = new URL( spotlightUrl );
+ connection = ( HttpURLConnection )url.openConnection();
+ connection.setRequestMethod( "POST" );
+ connection.setRequestProperty( "Content-Type", "application/x-www-form-urlencoded" );
+ connection.setRequestProperty( "Accept", "text/xml" );
+
+ connection.setUseCaches( false );
+ connection.setDoInput( true );
+ connection.setDoOutput( true );
+
+ //Send request
+ DataOutputStream wr = new DataOutputStream (
+ connection.getOutputStream ());
+ wr.writeBytes( data.toString() );
+ wr.flush ();
+ wr.close ();
+
+ //Get Response
+ InputStream is = connection.getInputStream();
+ BufferedReader rd = new BufferedReader( new InputStreamReader( is ) );
+ String line;
+ while((line = rd.readLine()) != null) {
+ response.append( line );
+ response.append( '\r' );
+ }
+ rd.close();
+
+ } catch (Exception e) {
+
+ log.error( "[request] Request could not be made. Error: " + e.getMessage() );
+ e.printStackTrace();
+ return null;
+
+ } finally {
+
+ if(connection != null) {
+ connection.disconnect();
+ }
+ }
+
+
+ XMLParser xmlParser = new XMLParser();
+ try {
+ Document xmlDoc = xmlParser.loadXMLFromString( response.toString() );
+ NodeList nlist = xmlParser.getElementsByTagName( xmlDoc, "surfaceForm" );
+ Collection<DBPSLSurfaceForm> annos = this.getAnnotations( nlist );
+
+ return annos;
+ } catch ( Exception e) {
+ log.error( "[response] Response XML could not be parsed. Error: " + e.getMessage() );
+ throw new EngineException( "Response XML could not be parsed. Error: " + e.getMessage() );
+ }
+ }
+
+
+    /**
+     * Creates the Collection of surface forms which <code>createEnhancements</code>
+     * adds to the meta data of the content item as TextAnnotations. The result
+     * keeps the order of the nodes in the list.
+     *
+     * @param nList NodeList of the surfaceForm elements found in the XML response
+     * @return a Collection<DBPSLSurfaceForm> with all annotations, in node order
+     * @throws NumberFormatException if an element has no parsable "offset" attribute
+     */
+    private Collection<DBPSLSurfaceForm> getAnnotations( NodeList nList ) {
+        int count = nList.getLength();
+        // A list (rather than a set of identity-hashed objects) gives a stable,
+        // reproducible iteration order.
+        Collection<DBPSLSurfaceForm> surfaceForms = new ArrayList<DBPSLSurfaceForm>( count );
+
+        for (int i = 0; i < count; i++) {
+            Element node = (Element) nList.item( i );
+            DBPSLSurfaceForm surfaceForm = new DBPSLSurfaceForm();
+            surfaceForm.name = node.getAttribute( "name" );
+            surfaceForm.offset = Integer.valueOf( node.getAttribute( "offset" ) );
+            surfaceForm.type = node.getAttribute( "type" );
+
+            surfaceForms.add( surfaceForm );
+        }
+
+        return surfaceForms;
+    }
+
+
+ public Map<String, Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+ }
+
+
+    /**
+     * Looks up the first dc:language value for the given subject in the graph.
+     *
+     * @param model the metadata graph to search
+     * @param subj the subject to match, passed unchanged to the graph filter
+     * @return the lexical form of the first match, or <code>null</code> if none exists
+     */
+    public String getMetadataLanguage(MGraph model, NonLiteral subj) {
+        Iterator<Triple> languageTriples = model.filter(subj, DC_LANGUAGE, null);
+        if (!languageTriples.hasNext()) {
+            return null;
+        }
+        return getLexicalForm(languageTriples.next().getObject());
+    }
+
+ public String getLexicalForm(Resource res) {
+ if (res == null) {
+ return null;
+ } else if (res instanceof Literal) {
+ return ((Literal) res).getLexicalForm();
+ } else {
+ return res.toString();
+ }
+ }
+
+
+    /**
+     * Sets the endpoint url; used by the test class, which does not activate
+     * the component.
+     *
+     * @param url String the url of the Spotlight endpoint
+     */
+    public void setEndpointUrl( String url ) {
+        this.spotlightUrl = url;
+    }
+
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/XMLParser.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/XMLParser.java?rev=1374984&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/XMLParser.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/XMLParser.java Mon Aug 20 12:11:01 2012
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlightspot;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Parses the XML results given by DBPedia Spotlight.
+ *
+ * <p>Documents loaded from a string or a stream are parsed namespace-aware and,
+ * where the underlying parser supports it, with DOCTYPE declarations rejected.
+ *
+ * @author <a href="mailto:iavor.jelev@babelmonkeys.com">Iavor Jelev</a>
+ */
+
+public class XMLParser {
+
+    /**
+     * Returns all elements of the document with the given tag name.
+     *
+     * @param doc the document to search
+     * @param tagName the tag name to look for
+     * @return the matching elements in document order
+     */
+    public NodeList getElementsByTagName( Document doc, String tagName ) {
+        return doc.getElementsByTagName( tagName );
+    }
+
+
+    /**
+     * Parses XML held in a string.
+     *
+     * @param xml the XML text
+     * @return the parsed, normalized document
+     * @throws SAXException if the text is not well-formed XML
+     * @throws IOException if the text cannot be read or no parser can be created
+     */
+    public Document loadXMLFromString( String xml ) throws SAXException, IOException {
+        // Parse from characters: the text is already decoded, so no
+        // platform-dependent String -> byte[] conversion is involved.
+        org.xml.sax.InputSource source = new org.xml.sax.InputSource( new java.io.StringReader( xml ) );
+        Document doc = newResponseBuilder().parse( source );
+        doc.getDocumentElement().normalize();
+
+        return doc;
+    }
+
+
+    /**
+     * Parses XML from a stream. The stream is closed before this method
+     * returns, also when parsing fails.
+     *
+     * @param is the stream to read the XML from
+     * @return the parsed, normalized document
+     * @throws SAXException if the content is not well-formed XML
+     * @throws IOException if the stream cannot be read or no parser can be created
+     */
+    public Document loadXMLFromInputStream( InputStream is ) throws SAXException, IOException {
+        try {
+            Document doc = newResponseBuilder().parse( is );
+            doc.getDocumentElement().normalize();
+
+            return doc;
+        } finally {
+            is.close();
+        }
+    }
+
+
+    /**
+     * Parses XML from a file (not namespace-aware).
+     *
+     * @param filePath path of the file to parse
+     * @return the parsed, normalized document
+     * @throws ParserConfigurationException if no parser can be created
+     * @throws SAXException if the content is not well-formed XML
+     * @throws IOException if the file cannot be read
+     */
+    public Document loadXMLFromFile( String filePath ) throws ParserConfigurationException, SAXException, IOException {
+        File xmlFile = new File( filePath );
+        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
+        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
+        Document doc = dBuilder.parse( xmlFile );
+        doc.getDocumentElement().normalize();
+
+        return doc;
+    }
+
+
+    /**
+     * Creates the namespace-aware builder used for service responses. A parser
+     * configuration problem is reported as an IOException instead of being
+     * swallowed (which previously ended in a NullPointerException).
+     */
+    private static DocumentBuilder newResponseBuilder() throws IOException {
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        factory.setNamespaceAware( true );
+        try {
+            // Responses come from a remote service: reject DOCTYPE declarations
+            // so that no external entities can be resolved.
+            factory.setFeature( "http://apache.org/xml/features/disallow-doctype-decl", true );
+        } catch ( ParserConfigurationException ignored ) {
+            // best effort: the parser implementation does not know this feature
+        }
+        try {
+            return factory.newDocumentBuilder();
+        } catch ( ParserConfigurationException ex ) {
+            throw new IOException( "Unable to create an XML parser: " + ex.getMessage(), ex );
+        }
+    }
+}
\ No newline at end of file
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1374984&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/resources/OSGI-INF/metatype/metatype.properties (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Aug 20 12:11:01 2012
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+
+# This file contains localization strings for configuration labels and
+# descriptions as used in the metatype.xml descriptor generated by the
+# Maven SCR plugin
+
+stanbol.DBPSpotlightSpotEnhancementEngine.name = DBpedia Spotlight Spotter: Named Entity Recognition
+stanbol.DBPSpotlightSpotEnhancementEngine.description = This engine performs just Named Entity Recognition, \
+ so it is suited for EnhancementChain scenario, in which another Engine links the recognized TextAnnotations \
+ to Ontology Types
+stanbol.DBPSpotlightSpotEnhancementEngine.url.name = Spotlight URL
+stanbol.DBPSpotlightSpotEnhancementEngine.url.description = The URL which will be used for the request
+stanbol.DBPSpotlightSpotEnhancementEngine.spotter.name = Spotter
+stanbol.DBPSpotlightSpotEnhancementEngine.spotter.description = The algorithm which will be used for Spotting \
+ (aka Term Recognition). Currently available: NER, LingPipeSpotter, OpenNLPChunkerSpotter, Kea
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/core/DBPSpotlightSpotEnhancementTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/core/DBPSpotlightSpotEnhancementTest.java?rev=1374984&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/core/DBPSpotlightSpotEnhancementTest.java (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/core/DBPSpotlightSpotEnhancementTest.java Mon Aug 20 12:11:01 2012
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.dbpspotlightspot.core;
+
+import java.util.Collection;
+
+import org.apache.stanbol.enhancer.engines.dbpspotlightspot.DBPSLSurfaceForm;
+import org.apache.stanbol.enhancer.engines.dbpspotlightspot.DBPSpotlightSpotEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class provides a JUnit test for DBpedia Spotlight Spot EnhancementEngine.
+ *
+ * <p>The test sends a request to the Spotlight endpoint; the URL can be
+ * overridden with the system property named by
+ * {@link DBPSpotlightSpotEnhancementEngine#SL_URL_KEY}.
+ *
+ * @author Iavor Jelev, babelmonkeys / GzEvD
+ */
+public class DBPSpotlightSpotEnhancementTest {
+
+    /**
+     * This contains the logger.
+     */
+    private static final Logger LOG = LoggerFactory.getLogger(DBPSpotlightSpotEnhancementTest.class);
+    /** The endpoint under test: the system property if set, else the public endpoint. */
+    private static String SPL_URL = System.getProperty(
+            DBPSpotlightSpotEnhancementEngine.SL_URL_KEY, "http://spotlight.dbpedia.org/rest/spot");
+    private static String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday";
+    private static DBPSpotlightSpotEnhancementEngine dbpslight;
+
+    @BeforeClass
+    public static void oneTimeSetup() throws ConfigurationException {
+        dbpslight = new DBPSpotlightSpotEnhancementEngine();
+        dbpslight.setEndpointUrl( SPL_URL );
+    }
+
+
+    /**
+     * Sends the test text to the endpoint and expects at least one surface form.
+     *
+     * @throws EngineException reported by JUnit as an error, with its stack trace
+     */
+    @Test
+    public void testEntityExtraction() throws EngineException {
+        Collection<DBPSLSurfaceForm> entities = dbpslight.doPostRequest( TEST_TEXT );
+        // doPostRequest returns null when the request could not be made; fail
+        // with a clear message instead of a NullPointerException
+        Assert.assertNotNull("The request to " + SPL_URL + " returned no result!", entities);
+        LOG.info("Found entities: {}",entities.size());
+        LOG.debug("Entities:\n{}",entities);
+        Assert.assertFalse("No entities were found!", entities.isEmpty());
+    }
+
+}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/resources/README
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/resources/README?rev=1374984&view=auto
==============================================================================
--- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/resources/README (added)
+++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/resources/README Mon Aug 20 12:11:01 2012
@@ -0,0 +1,15 @@
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+