You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/05/19 12:27:16 UTC

svn commit: r1340409 - in /incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src: main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/ test/java/org/apache/s...

Author: rwesten
Date: Sat May 19 10:27:15 2012
New Revision: 1340409

URL: http://svn.apache.org/viewvc?rev=1340409&view=rev
Log:
STANBOL-583: Mainly adaptions of the CELI classification engine

* fise:TopicEnhancements are now created as defined by STANBOL-617. For that I needed to change the mapping of the CELI results (see the notes in the SOAP result processing part of the HttpClient).
* Adapted UnitTest to check those
* HttpClient adaptions similar to the other engines
* Improved error handling of the classification engine
* classification engine now uses a write lock while writing classification results


NOTE: I added extensive NOTE comments explaining the changes made to the classification engine. Many of those notes would apply similarly to all CELI engines.

Modified:
    incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
    incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
    incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
    incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
    incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java

Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java Sat May 19 10:27:15 2012
@@ -1,7 +1,13 @@
 package org.apache.stanbol.enhancer.engines.celi.classification.impl;
 
+import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.createTextEnhancement;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.SKOS_CONCEPT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
 
 import java.io.IOException;
 import java.net.URL;
@@ -13,11 +19,14 @@ import java.util.Map.Entry;
 import java.util.Set;
 import java.util.Vector;
 
+import javax.xml.soap.SOAPException;
+
 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NoConvertorException;
 import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
@@ -37,6 +46,8 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
 import org.slf4j.Logger;
@@ -52,7 +63,8 @@ public class CeliClassificationEnhanceme
 	 * This ensures that no connections to external services are made if Stanbol is started in offline mode 
 	 * as the OnlineMode service will only be available if OfflineMode is deactivated. 
 	 */
-	@Reference
+	@SuppressWarnings("unused") //it's not unused!
+    @Reference
     private OnlineMode onlineMode; 
 	
 	private static List<String> supportedLangs = new Vector<String>();
@@ -66,6 +78,10 @@ public class CeliClassificationEnhanceme
 		supportedLangs.add("pl");
 		supportedLangs.add("nl");
 	}
+	/**
+	 * The literal factory used to create types literals
+	 */
+    private LiteralFactory literalFactory = LiteralFactory.getInstance();
 
 	/**
 	 * The literal representing the LangIDEngine as creator.
@@ -77,10 +93,16 @@ public class CeliClassificationEnhanceme
 	 * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
 	 */
 	public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
-
+	/**
+	 * Currently used as fise:entity-type for TopicAnnotations
+	 */
+	private static final UriRef OWL_CLASS = new UriRef("http://www.w3.org/2002/07/owl#Class");
+	
 	private Logger log = LoggerFactory.getLogger(getClass());
 
-	private String language = null;
+	//NOTE: one CAN NOT store the language as member, as EnhancementEngines
+	//      can be called in parallel by multiple threads!
+	//private String language = null;
 
 	/**
 	 * This contains the only MIME type directly supported by this enhancement
@@ -108,7 +130,8 @@ public class CeliClassificationEnhanceme
 	@Activate
 	protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
 		super.activate(ctx);
-		Dictionary<String, Object> properties = ctx.getProperties();
+		@SuppressWarnings("unchecked")
+        Dictionary<String, Object> properties = ctx.getProperties();
 		this.licenseKey = (String) properties.get(LICENSE_KEY);
 		if (licenseKey == null || licenseKey.isEmpty()) {
 			log.warn("no CELI license key configured for this Engine, a guest account will be used (max 100 requests per day). Go on http://linguagrid.org for getting a proper license key.");
@@ -129,63 +152,114 @@ public class CeliClassificationEnhanceme
 
 	@Override
 	public int canEnhance(ContentItem ci) throws EngineException {
-		this.language = EnhancementEngineHelper.getLanguage(ci);
-		if (language == null) {
-			throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
-		}
-
-		if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null && this.isLangSupported(language))
+		String language = EnhancementEngineHelper.getLanguage(ci);
+		//canEnhance should inform if it can not enhance a ContentItem because
+		//of an potential error in the EnhancementChain configuration, but not
+		//throw runtime exceptions.
+//		if (language == null) {
+//			throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+//		}
+        if(language==null) {
+            log.warn("Unable to enhance ContentItem {} because language of the Content is unknown." +
+                    " Please check that a language identification engine is active in this EnhancementChain.",
+                    ci.getUri());
+        }
+
+		if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null && this.isLangSupported(language)) {
+		    //NOTE: ENHANCE_ASYNC indicates that the computeEnhancements Method
+		    //      correctly applies read/write locks to the contentItem
 			return ENHANCE_ASYNC;
-		else
+		} else {
 			return CANNOT_ENHANCE;
+		}
 	}
 
 
 	@Override
 	public void computeEnhancements(ContentItem ci) throws EngineException {
-		if (this.language == null)
-			this.language = EnhancementEngineHelper.getLanguage(ci);
-
+	    //NOTE: in the computeEnhancements Method on can check metadata already
+	    //      checked within the canEnhance method. THis is not required, but it
+	    //      may help to identify potential bugs in the EnhancementJobManager
+	    //      implementation
+        String language = EnhancementEngineHelper.getLanguage(ci);
+        if (!isLangSupported(language)){
+            throw new IllegalStateException("Call to computeEnhancement with unsupported language '"
+                    +language+" for ContentItem "+ ci.getUri() +": This is also checked "
+                    + "in the canEnhance method! -> This indicated an Bug in the "
+                    + "implementation of the " + "EnhancementJobManager!");
+        }
 		Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
 		if (contentPart == null) {
-			throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This "
-					+ "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+			throw new IllegalStateException("No ContentPart with Mimetype '" 
+			        + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " 
+			        + ci.getUri() + ": This is also checked in the canEnhance "
+			        + "method! -> This indicates an Bug in the implementation of "
+			        + "the EnhancementJobManager!");
 		}
-		String text = "";
+		String text;
 		try {
 			text = ContentItemHelper.getText(contentPart.getValue());
 		} catch (IOException e) {
 			throw new InvalidContentException(this, ci, e);
 		}
 		if (text.trim().length() == 0) {
-			log.info("No text contained in ContentPart {"+contentPart.getKey()+"} of ContentItem {"+ci.getUri()+"}");
+			log.info("No text contained in ContentPart {} of ContentItem {}",
+			    contentPart.getKey(),ci.getUri());
 			return;
 		}
-
+		//NOTE: EnhancementEngine implementations should pass all Exceptions 
+		//      (RuntimeExceptions as is and others wrapped as EngineExceptions). 
+		//      The EnhancementJobManager implementation has to catch and
+		//      process all those. Handling depends on the configuration of the
+		//      EnhancementChain (e.g. if this engine is optional enhancement of
+		//      the ContentItem will continue).
+		//      This is important as otherwise Users would get "200 ok" replies
+		//      for failed enhancement requests that have failed!
+		//
+		//      This means that:
+		//      * Http clients should pass on IOExceptions and SOAPExceptions
+		//      * No try/catch that also includes RuntimeExceptions
+		List<Concept> lista;
 		try {
-			
-			List<Concept> lista = this.client.extractConcepts(text, language);
-			LiteralFactory literalFactory = LiteralFactory.getInstance();
-
-			MGraph g = ci.getMetadata();
-
-			UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
-
-			for (Concept ne : lista) {
-				List<UriRef> uris = this.getEntityRefForType(ne.getClassLabel());
-
-				try {
-					for (UriRef uri : uris)
-						g.add(new TripleImpl(textAnnotation, DC_RELATION, uri));
-					g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(ne.getConfidence())));
-				} catch (NoConvertorException e) {
-					log.error(e.getMessage(),e);
-				}
-			}
-		} catch (Exception e) {
-			log.error(e.getMessage(),e);
+			lista = this.client.extractConcepts(text, language);
+        } catch (IOException e) { //re-throw exceptions as EngineException
+            throw new EngineException("Error while calling the CELI classification"
+                +" service (configured URL: " +serviceURL+")!",e);
+        } catch (SOAPException e) {
+            throw new EngineException("Error wile encoding/decoding the request/"
+                +"response to the CELI classification service!",e);
+        } 
+		if(lista.isEmpty()){ //not topics found
+		    return; //nothing to do
+		}
+		MGraph g = ci.getMetadata();
+		//NOTE: EnhancementEngines that use "ENHANCE_ASYNC" need to acquire a
+		//      writeLock before modifications to the enhancement metadata
+		ci.getLock().writeLock().lock();
+		try {
+    		//see STANBOL-617 for rules how to encode extracted topics
+    		//we need a single TextAnnotation to link all TopicAnnotations
+    		UriRef textAnnotation = createTextEnhancement(ci, this);
+    		// add the dc:type skos:Concept
+    		g.add(new TripleImpl(textAnnotation, DC_TYPE, SKOS_CONCEPT));
+    		
+    		//not create the fise:TopicAnnotations
+    		for (Concept ne : lista) {
+    		    UriRef topicAnnotation = EnhancementEngineHelper.createTopicEnhancement(ci, this);
+    	        g.add(new TripleImpl(topicAnnotation, ENHANCER_ENTITY_REFERENCE, ne.getUri()));
+                g.add(new TripleImpl(topicAnnotation, ENHANCER_ENTITY_LABEL, 
+                    new PlainLiteralImpl(ne.getLabel())));
+                //TODO: currently I use owl:class as entity-type, because that is
+                //      what the linked dbpedia ontology resources are.
+                g.add(new TripleImpl(topicAnnotation, ENHANCER_ENTITY_TYPE, OWL_CLASS));
+                g.add(new TripleImpl(topicAnnotation, ENHANCER_CONFIDENCE, 
+                    literalFactory.createTypedLiteral(ne.getConfidence())));
+                //link to the TextAnnotation
+                g.add(new TripleImpl(topicAnnotation, DC_RELATION, textAnnotation));
+    		}
+		} finally {
+		    ci.getLock().writeLock().unlock();
 		}
-
 	}
 
 	private boolean isLangSupported(String language) {
@@ -195,14 +269,6 @@ public class CeliClassificationEnhanceme
 			return false;
 	}
 
-	private List<UriRef> getEntityRefForType(String classificationLabels) {
-		List<UriRef> refs = new Vector<UriRef>();
-		String[] tmps = classificationLabels.split(" ");
-		for (String dbPediaLabel : tmps) {
-			refs.add(new UriRef(NamespaceEnum.dbpedia_ont + dbPediaLabel));
-		}
-		return refs;
-	}
 
 	@Override
 	public Map<String, Object> getServiceProperties() {

Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java Sat May 19 10:27:15 2012
@@ -1,23 +1,35 @@
 package org.apache.stanbol.enhancer.engines.celi.classification.impl;
 
+import java.io.BufferedWriter;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
 import java.net.HttpURLConnection;
 import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Vector;
 
 import javax.xml.soap.MessageFactory;
 import javax.xml.soap.SOAPBody;
+import javax.xml.soap.SOAPException;
 import javax.xml.soap.SOAPMessage;
 import javax.xml.soap.SOAPPart;
 import javax.xml.transform.stream.StreamSource;
 
+import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.util.Base64;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.stanbol.enhancer.engines.celi.utils.Utils;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Element;
@@ -25,108 +37,258 @@ import org.w3c.dom.NodeList;
 
 public class ClassificationClientHTTP {
 	
-	private final Logger log = LoggerFactory.getLogger(getClass());
-	
+	private final static Logger log = LoggerFactory.getLogger(ClassificationClientHTTP.class);
+	//NOTE: Defining charset, content-type and SOAP prefix/suffix as
+	//      constants does make more easy to configure those things
+    /**
+     * The UTF-8 {@link Charset}
+     */
+    private static final Charset UTF8 = Charset.forName("UTF-8");
+    /**
+     * The content type "text/xml; charset={@link #UTF8}"
+     */
+    private static final String CONTENT_TYPE = "text/xml; charset="+UTF8.name();
+    /**
+     * The XML version, encoding; SOAP envelope, heder and starting element of the body;
+     * processTextRequest and text starting element.
+     */
+    private static final String SOAP_PREFIX = "<?xml version=\"1.0\" encoding=\""+UTF8.name()+"\"?>" 
+            + "<soapenv:Envelope xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" "
+            + "xmlns:clas=\"http://linguagrid.org/v20110204/classification\"><soapenv:Header/><soapenv:Body>";
+    /**
+     * closes the text, processTextRequest, SOAP body and envelope
+     */
+    private static final String SOAP_SUFFIX = "</soapenv:Body></soapenv:Envelope>";
+    
+    //TODO: This should be configurable
 	private static final int maxResultToReturn = 3;
 	
-	private URL serviceEP;
-	private String licenseKey;
+	private final URL serviceEP;
+	private final String licenseKey;
+	
+	//NOTE: the request headers are the same for all request - so they can be
+	//      initialized in the constructor.
+	private final Map<String,String> requestHeaders;
 	
 	
 	public ClassificationClientHTTP(URL serviceUrl, String licenseKey){
 		this.serviceEP=serviceUrl;
 		this.licenseKey=licenseKey;
+        Map<String,String> headers = new HashMap<String,String>();
+        headers.put("Content-Type", CONTENT_TYPE);
+        if(licenseKey != null){
+            String encoded = Base64.encode(this.licenseKey.getBytes(UTF8));
+            headers.put("Authorization", "Basic "+encoded);
+        }
+        this.requestHeaders = Collections.unmodifiableMap(headers);
 	}
 	
-	
-	public String doPostRequest(URL url, String body) throws IOException {
-		
-		HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
-		urlConn.setRequestMethod("POST");
-		urlConn.setDoInput(true);
-		if (null != body) {
-			urlConn.setDoOutput(true);
-		} else {
-			urlConn.setDoOutput(false);
-		}
-		urlConn.setUseCaches(false);
-		String	contentType = "text/xml; charset=utf-8";
-		urlConn.setRequestProperty("Content-Type", contentType);
-		if(this.licenseKey!=null){
-			String encoded = Base64.encode(this.licenseKey.getBytes("UTF-8"));
-			urlConn.setRequestProperty("Authorization", "Basic "+encoded);
-		}
-		
-		// send POST output
-		if (null != body) {
-			OutputStreamWriter printout = new OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
-			printout.write(body);
-			printout.flush();
-			printout.close();
-		}
-		
-		//close connection
-		urlConn.disconnect();
-		
-		// get response data
-		return IOUtils.toString(urlConn.getInputStream(), "UTF-8");
-	}
-
-
-	public List<Concept> extractConcepts(String text,String lang) {
-		List<Concept> extractedConcepts = new Vector<Concept>();
+	/*
+	 * NOTE: parsing/returning a String requires to create in-memory copies
+	 *       of the sent/received data. Imaging users that send the text of
+	 *       100 pages PDF files to the Stanbol Enhancer.
+	 *       Because of that an implementation that directly streams the
+	 *       StringEscapeUtils.escapeXml(..) to the request is preferable 
+	 *       
+	 *       This will no longer allow to debug the data of the request and
+	 *       response. See the commented main method at the end for alternatives
+	 */
+//	public String doPostRequest(URL url, String body) throws IOException {
+//		
+//		HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
+//		urlConn.setRequestMethod("POST");
+//		urlConn.setDoInput(true);
+//		if (null != body) {
+//			urlConn.setDoOutput(true);
+//		} else {
+//			urlConn.setDoOutput(false);
+//		}
+//		urlConn.setUseCaches(false);
+//		String	contentType = "text/xml; charset=utf-8";
+//		urlConn.setRequestProperty("Content-Type", contentType);
+//		if(this.licenseKey!=null){
+//			String encoded = Base64.encode(this.licenseKey.getBytes("UTF-8"));
+//			urlConn.setRequestProperty("Authorization", "Basic "+encoded);
+//		}
+//		
+//		// send POST output
+//		if (null != body) {
+//			OutputStreamWriter printout = new OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
+//			printout.write(body);
+//			printout.flush();
+//			printout.close();
+//		}
+//		
+//		//close connection
+//		urlConn.disconnect();
+//		
+//		// get response data
+//		return IOUtils.toString(urlConn.getInputStream(), "UTF-8");
+//	}
+
+
+	//NOTE: forward IOException and SOAPExceptions to allow correct error handling
+	//      by the EnhancementJobManager.
+	//      Also RuntimeExceptions MUST NOT be cached out of the same reason!
+	public List<Concept> extractConcepts(String text,String lang) throws IOException, SOAPException {
+        if(text == null || text.isEmpty()){
+            //no text -> no classification
+            return Collections.emptyList();
+        }
+
+        //create the POST request
+        HttpURLConnection con = Utils.createPostRequest(serviceEP, requestHeaders);
+        //"stream" the request content directly to the buffered writer
+        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(con.getOutputStream(),UTF8));
+        writer.write(SOAP_PREFIX);
+        writer.write("<clas:classify>");
+        writer.write("<clas:user>wiki</clas:user>");//TODO: should the user be configurable?
+        writer.write("<clas:model>");
+        writer.write(lang);
+        writer.write("</clas:model>");
+        writer.write("<clas:text>");
+        StringEscapeUtils.escapeXml(writer, text); //write the escaped text directly to the request
+        writer.write("</clas:text>");
+        writer.write("</clas:classify>");
+        writer.write(SOAP_SUFFIX);
+        writer.close();
+
+        //Call the service
+        long start = System.currentTimeMillis();
+        InputStream stream = con.getInputStream();
+        log.debug("Request to {} took {}ms",serviceEP,System.currentTimeMillis()-start);
+
+        //NOTE: forward IOException and SOAPExceptions to allow correct error handling
+        //      by the EnhancementJobManager.
+        //      Also RuntimeExceptions MUST NOT be caught for the same reason!
+
+//		try {
+
+		// Create SoapMessage
+		MessageFactory msgFactory = MessageFactory.newInstance();
+		SOAPMessage message = msgFactory.createMessage();
+		SOAPPart soapPart = message.getSOAPPart();
+
+		// NOTE: directly use the InputStream provided by the URLConnection!
+//			ByteArrayInputStream stream = new ByteArrayInputStream(responseXml.getBytes("UTF-8"));
+		StreamSource source = new StreamSource(stream);
+
+		// Set contents of message
+		soapPart.setContent(source);
+
+		SOAPBody soapBody = message.getSOAPBody();
+        List<Concept> extractedConcepts = new Vector<Concept>();
+		NodeList nlist = soapBody.getElementsByTagNameNS("*","return");
+		HashSet<String> inserted=new HashSet<String>();
+		for (int i = 0; i < nlist.getLength() && i<maxResultToReturn; i++) {
+		    //NOTE: do not catch RuntimeExceptions. Error handling is done by
+		    //      the EnhancementJobManager!
+//			try {
+			Element result = (Element) nlist.item(i);
+
+			//NOTE: (rwesten) implemented a mapping from the CELI classification
+			//      to the Stanbol fise:TopicEnhancements (STANBOL-617) that
+			//        * one fise:TopicAnnotation is generated per "model"
+			//        * the whole label string is used as fise:entity-label
+			//        * the uri of the most specific dbpedia ontology type (see
+			//          selectClassificationClass) is used as fise:entity-reference
+			//      This has the intuition that for users it is easier to grasp
+			//      the meaning of the whole lable, while for machines the link
+			//      to the most specific dbpedia ontology class is best suited.
+			String model = result.getElementsByTagNameNS("*","label").item(0).getTextContent();
+			model=model.substring(1, model.length()-1);
+			UriRef modelConcept = selectClassificationClass(model);
+			String conf=result.getElementsByTagNameNS("*","score").item(0).getTextContent();
+			Double confidence= new Double(conf);
+			extractedConcepts.add(new Concept(model,modelConcept,confidence));
+//			} catch (Exception e) {
+//				e.printStackTrace();
+//			}
 
-		try {
-			String txt = StringEscapeUtils.escapeXml(text);
-			String xmldata = "<soapenv:Envelope xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" xmlns:clas=\"http://linguagrid.org/v20110204/classification\"><soapenv:Header/><soapenv:Body> <clas:classify>"
-							+"<clas:user>wiki</clas:user><clas:model>"+lang+"</clas:model><clas:text>"+txt+"</clas:text></clas:classify></soapenv:Body></soapenv:Envelope>";
-
-			
-			String responseXml = doPostRequest(this.serviceEP, xmldata);
-			log.debug(responseXml);
-
-			// Create SoapMessage
-			MessageFactory msgFactory = MessageFactory.newInstance();
-			SOAPMessage message = msgFactory.createMessage();
-			SOAPPart soapPart = message.getSOAPPart();
-
-			// Load the SOAP text into a stream source
-			ByteArrayInputStream stream = new ByteArrayInputStream(responseXml.getBytes("UTF-8"));
-			StreamSource source = new StreamSource(stream);
-
-			// Set contents of message
-			soapPart.setContent(source);
-
-			SOAPBody soapBody = message.getSOAPBody();
-			NodeList nlist = soapBody.getElementsByTagNameNS("*","return");
-			HashSet<String> inserted=new HashSet<String>();
-			for (int i = 0; i < nlist.getLength() && i<maxResultToReturn; i++) {
-				try {
-					Element result = (Element) nlist.item(i);
-
-					String model = result.getElementsByTagNameNS("*","label").item(0).getTextContent();
-					model=model.substring(1, model.length()-1);
-					String conf=result.getElementsByTagNameNS("*","score").item(0).getTextContent();
-					float confidence=Float.parseFloat(conf);
-					
-					String[] tmps=model.split(" ");
-					
-					for(String t: tmps){
-						if(!inserted.contains(t)){
-							extractedConcepts.add(new Concept(t, confidence));
-							inserted.add(t);
-						}
-					}
-				} catch (Exception e) {
-					e.printStackTrace();
-				}
-
-			}
-		} catch (Exception e) {
-			e.printStackTrace();
 		}
-		
+//		} catch (Exception e) {
+//			e.printStackTrace();
+//		}
 		return extractedConcepts;
 	}
-
+    /**
+     * TopicClassifications require only a single fise:entity-reference.
+     * However the CELI classification service delivers <p>
+     * <code><pre>
+     *     <ns2:label>[Organisation HockeyTeam SportsTeam]</ns2:label>
+     * </pre></code>
+     * because of that this method needs to select one of the labels.<p>
+     * This method currently selects the 2nd token if there are more than one
+     * concept suggestions included. NOTE that the whole literal is used as
+     * fise:entity-label!
+     * @param classificationLabels the label string
+     * @return the selected label
+     */
+    private UriRef selectClassificationClass(String classificationLabels) {
+        //NOTE: (rwesten) In general it would be better if CELI could provide
+        //      de-referenceable URLs for those suggestions.
+        //      If that is possible one would no longer need to link to the
+        //      most specific dbpedia ontology class for a category e.g.
+        //          http://dbpedia.org/ontology/HockeyTeam
+        //      for
+        //          [Organisation HockeyTeam SportsTeam]
+        //      but e.g.
+        //          http://linguagrid.org/category/HockeyTeam
+        //      meaning the linguagrid could provide categories as skos thesaurus
+        //      via it's web interface
+        int start = classificationLabels.charAt(0) == '[' ? 1 : 0;
+        int end = classificationLabels.charAt(classificationLabels.length()-1) == ']' ?
+                classificationLabels.length() - 1 : classificationLabels.length();
+        String[] tmps = classificationLabels.substring(start, end).split(" ");
+        return new UriRef(NamespaceEnum.dbpedia_ont.getNamespace()+ //the namespace
+            (tmps.length > 1 ? tmps[1] : tmps[0])); //the Class for the label
+    }	
+	
+	//NOTE: If you stream the contents directly to the stream, you can no longer
+	//      debug the request/response. Because of that it is sometimes
+	//      helpful to have a main method for those tests
+	//      An even better variant would be to write a UnitTest for that!!
+	//      This would be recommended of the called service is still in beta
+	//      and may change at any time
+//    public static void main(String[] args) throws Exception {
+//        String lang = "fr";
+//        String text = "Brigitte Bardot, née  le 28 septembre " +
+//                "1934 à Paris, est une actrice de cinéma et chanteuse française.";
+//        
+//        //For request testing
+//        //Writer request = new StringWriter();
+//        
+//        //For response testing
+//        HttpURLConnection con = Utils.createPostRequest(
+//            new URL("http://linguagrid.org/LSGrid/ws/dbpedia-classification"),
+//            Collections.singletonMap("Content-Type", CONTENT_TYPE));
+//        Writer request = new OutputStreamWriter(con.getOutputStream(),UTF8);
+//        
+//        //"stream" the request content directly to the buffered writer
+//        BufferedWriter writer = new BufferedWriter(request);
+//        
+//        writer.write(SOAP_PREFIX);
+//        writer.write("<clas:classify>");
+//        writer.write("<clas:user>wiki</clas:user>");//TODO: should the user be configurable?
+//        writer.write("<clas:model>");
+//        writer.write(lang);
+//        writer.write("</clas:model>");
+//        writer.write("<clas:text>");
+//        StringEscapeUtils.escapeXml(writer, text); //write the escaped text directly to the request
+//        writer.write("</clas:text>");
+//        writer.write("</clas:classify>");
+//        writer.write(SOAP_SUFFIX);
+//        writer.close();
+//        
+//        //log the Request (if request testing)
+//        //log.info("Request \n{}",request.toString());
+//        
+//        //for response testing we need to call the service
+//        //Call the service
+//        long start = System.currentTimeMillis();
+//        InputStream stream = con.getInputStream();
+//        log.info("Request to took {}ms",System.currentTimeMillis()-start);
+//        log.info("Response:\n{}",IOUtils.toString(stream));
+//        stream.close();
+//    }
 }

Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java Sat May 19 10:27:15 2012
@@ -1,28 +1,34 @@
 package org.apache.stanbol.enhancer.engines.celi.classification.impl;
 
+import org.apache.clerezza.rdf.core.UriRef;
+
 public class Concept {
 	
-	private String classLabel;
-	private float confidence;
+	private final String label;
+	private final UriRef uri;
+	private final Double confidence;
 	
-	public Concept(String classLabel, float confidence) {
+	public Concept(String label, UriRef uri,Double confidence) {
 		super();
-		this.classLabel = classLabel;
+		this.label = label;
+		this.uri = uri;
 		this.confidence = confidence;
 	}
 	
-	public String getClassLabel() {
-		return classLabel;
-	}
-	public void setClassLabel(String classLabel) {
-		this.classLabel = classLabel;
-	}
-	public float getConfidence() {
+
+	public Double getConfidence() {
 		return confidence;
 	}
-	public void setConfidence(float confidence) {
-		this.confidence = confidence;
-	}
+
+
+    public String getLabel() {
+        return label;
+    }
+
+
+    public UriRef getUri() {
+        return uri;
+    }
 	
 	
 }

Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java Sat May 19 10:27:15 2012
@@ -75,7 +75,6 @@ public class NERserviceClientHTTP {
 	        //no text -> no extractions
 	        return Collections.emptyList();
 	    }
-		List<NamedEntity> extractedNE = new Vector<NamedEntity>();
 
 	    //create the POST request
 		HttpURLConnection con = Utils.createPostRequest(serviceEP, requestHeaders);
@@ -102,6 +101,9 @@ public class NERserviceClientHTTP {
 		soapPart.setContent(source);
 
 		SOAPBody soapBody = message.getSOAPBody();
+		
+		//extract the results
+        List<NamedEntity> extractedNE = new Vector<NamedEntity>();
 		NodeList nlist = soapBody.getElementsByTagName("result");
 		for (int i = 0; i < nlist.getLength(); i++) {
 			Element result = (Element) nlist.item(i);

Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java Sat May 19 10:27:15 2012
@@ -1,18 +1,25 @@
 package org.apache.stanbol.enhancer.engines.celi.classification.impl;
 
+import static junit.framework.Assert.assertEquals;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_ENTITYANNOTATION;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
+import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTopicAnnotations;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.net.UnknownHostException;
 import java.util.Dictionary;
+import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.Iterator;
 
+import junit.framework.Assert;
+
+import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
@@ -25,6 +32,8 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -76,11 +85,15 @@ public class CeliClassificationEnhanceme
 			classificationEngine.computeEnhancements(ci);
 
 	        TestUtils.logEnhancements(ci);
-			
-			int textAnnoNum = checkAllTextAnnotations(ci.getMetadata(), TEXT);
-	        log.info(textAnnoNum + " TextAnnotations found ...");
-	        int entityAnnoNum = checkAllEntityAnnotations(ci.getMetadata());
-	        log.info(entityAnnoNum + " EntityAnnotations found ...");
+	         HashMap<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
+	            expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
+	            expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(
+	                classificationEngine.getClass().getName()));
+
+			int textAnnoNum = EnhancementStructureHelper.validateAllTextAnnotations(ci.getMetadata(), TEXT,expectedValues);
+			assertEquals("Only a single fise:TextAnnotation is expeted", 1, textAnnoNum);
+			int numTopicAnnotations = validateAllTopicAnnotations(ci.getMetadata()  , expectedValues);
+			assertTrue("No TpocisAnnotations found", numTopicAnnotations > 0);
 		} catch (EngineException e) {
 			if (e.getCause() != null && e.getCause() instanceof UnknownHostException) {
 				log.warn("Celi Service not reachable -> offline? -> deactivate test");
@@ -90,25 +103,4 @@ public class CeliClassificationEnhanceme
 		}
 	}
 
-	private int checkAllEntityAnnotations(MGraph g) {
-		Iterator<Triple> entityAnnotationIterator = g.filter(null, RDF_TYPE, ENHANCER_ENTITYANNOTATION);
-		int entityAnnotationCount = 0;
-		while (entityAnnotationIterator.hasNext()) {
-			UriRef entityAnnotation = (UriRef) entityAnnotationIterator.next().getSubject();
-			entityAnnotationCount++;
-		}
-		return entityAnnotationCount;
-	}
-	
-	private int checkAllTextAnnotations(MGraph g, String content) {
-		Iterator<Triple> textAnnotationIterator = g.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
-		// test if a textAnnotation is present
-		assertTrue(textAnnotationIterator.hasNext());
-		int textAnnotationCount = 0;
-		while (textAnnotationIterator.hasNext()) {
-			UriRef textAnnotation = (UriRef) textAnnotationIterator.next().getSubject();
-			textAnnotationCount++;
-		}
-		return textAnnotationCount;
-	}
 }