You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/05/19 12:27:16 UTC
svn commit: r1340409 - in
/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src:
main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/
main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/
test/java/org/apache/s...
Author: rwesten
Date: Sat May 19 10:27:15 2012
New Revision: 1340409
URL: http://svn.apache.org/viewvc?rev=1340409&view=rev
Log:
STANBOL-583: Mainly adaptations of the CELI classification engine
* fise:TopicEnhancements are now created as defined by STANBOL-617; for that I needed to change the mappings from the CELI results (see the notes in the SOAP result processing part of the HttpClient)
* Adapted UnitTest to check those
* HttpClient adaptations similar to the other engines
* Improved error handling of the classification engine
* classification engine now uses a write lock while writing classification results
NOTE: I added extensive NOTES on the changes made to the classification engine. Many of those notes apply equally to all CELI engines.
Modified:
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java
Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java Sat May 19 10:27:15 2012
@@ -1,7 +1,13 @@
package org.apache.stanbol.enhancer.engines.celi.classification.impl;
+import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.createTextEnhancement;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.SKOS_CONCEPT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
import java.io.IOException;
import java.net.URL;
@@ -13,11 +19,14 @@ import java.util.Map.Entry;
import java.util.Set;
import java.util.Vector;
+import javax.xml.soap.SOAPException;
+
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NoConvertorException;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
@@ -37,6 +46,8 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
@@ -52,7 +63,8 @@ public class CeliClassificationEnhanceme
* This ensures that no connections to external services are made if Stanbol is started in offline mode
* as the OnlineMode service will only be available if OfflineMode is deactivated.
*/
- @Reference
+ @SuppressWarnings("unused") //it's not unused!
+ @Reference
private OnlineMode onlineMode;
private static List<String> supportedLangs = new Vector<String>();
@@ -66,6 +78,10 @@ public class CeliClassificationEnhanceme
supportedLangs.add("pl");
supportedLangs.add("nl");
}
+ /**
+ * The literal factory used to create typed literals
+ */
+ private LiteralFactory literalFactory = LiteralFactory.getInstance();
/**
* The literal representing the LangIDEngine as creator.
@@ -77,10 +93,16 @@ public class CeliClassificationEnhanceme
* {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
*/
public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
-
+ /**
+ * Currently used as fise:entity-type for TopicAnnotations
+ */
+ private static final UriRef OWL_CLASS = new UriRef("http://www.w3.org/2002/07/owl#Class");
+
private Logger log = LoggerFactory.getLogger(getClass());
- private String language = null;
+ //NOTE: one CAN NOT store the language as member, as EnhancementEngines
+ // can be called in parallel by multiple threads!
+ //private String language = null;
/**
* This contains the only MIME type directly supported by this enhancement
@@ -108,7 +130,8 @@ public class CeliClassificationEnhanceme
@Activate
protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
super.activate(ctx);
- Dictionary<String, Object> properties = ctx.getProperties();
+ @SuppressWarnings("unchecked")
+ Dictionary<String, Object> properties = ctx.getProperties();
this.licenseKey = (String) properties.get(LICENSE_KEY);
if (licenseKey == null || licenseKey.isEmpty()) {
log.warn("no CELI license key configured for this Engine, a guest account will be used (max 100 requests per day). Go on http://linguagrid.org for getting a proper license key.");
@@ -129,63 +152,114 @@ public class CeliClassificationEnhanceme
@Override
public int canEnhance(ContentItem ci) throws EngineException {
- this.language = EnhancementEngineHelper.getLanguage(ci);
- if (language == null) {
- throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
- }
-
- if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null && this.isLangSupported(language))
+ String language = EnhancementEngineHelper.getLanguage(ci);
+ //canEnhance should report that it cannot enhance a ContentItem because
+ //of a potential error in the EnhancementChain configuration, but must not
+ //throw runtime exceptions.
+// if (language == null) {
+// throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+// }
+ if(language==null) {
+ log.warn("Unable to enhance ContentItem {} because language of the Content is unknown." +
+ " Please check that a language identification engine is active in this EnhancementChain.",
+ ci.getUri());
+ }
+
+ if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null && this.isLangSupported(language)) {
+ //NOTE: ENHANCE_ASYNC indicates that the computeEnhancements Method
+ // correctly applies read/write locks to the contentItem
return ENHANCE_ASYNC;
- else
+ } else {
return CANNOT_ENHANCE;
+ }
}
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
- if (this.language == null)
- this.language = EnhancementEngineHelper.getLanguage(ci);
-
+ //NOTE: in the computeEnhancements method one can check metadata already
+ // checked within the canEnhance method. This is not required, but it
+ // may help to identify potential bugs in the EnhancementJobManager
+ // implementation
+ String language = EnhancementEngineHelper.getLanguage(ci);
+ if (!isLangSupported(language)){
+ throw new IllegalStateException("Call to computeEnhancement with unsupported language '"
+ +language+" for ContentItem "+ ci.getUri() +": This is also checked "
+ + "in the canEnhance method! -> This indicated an Bug in the "
+ + "implementation of the " + "EnhancementJobManager!");
+ }
Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
if (contentPart == null) {
- throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+ throw new IllegalStateException("No ContentPart with Mimetype '"
+ + TEXT_PLAIN_MIMETYPE + "' found for ContentItem "
+ + ci.getUri() + ": This is also checked in the canEnhance "
+ + "method! -> This indicates an Bug in the implementation of "
+ + "the EnhancementJobManager!");
}
- String text = "";
+ String text;
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(this, ci, e);
}
if (text.trim().length() == 0) {
- log.info("No text contained in ContentPart {"+contentPart.getKey()+"} of ContentItem {"+ci.getUri()+"}");
+ log.info("No text contained in ContentPart {} of ContentItem {}",
+ contentPart.getKey(),ci.getUri());
return;
}
-
+ //NOTE: EnhancementEngine implementations should pass all Exceptions
+ // (RuntimeExceptions as is and others wrapped as EngineExceptions).
+ // The EnhancementJobManager implementation has to catch and
+ // process all those. Handling depends on the configuration of the
+ // EnhancementChain (e.g. if this engine is optional enhancement of
+ // the ContentItem will continue).
+ // This is important as otherwise Users would get "200 ok" replies
+ // for enhancement requests that have failed!
+ //
+ // This means that:
+ // * Http clients should pass on IOExceptions and SOAPExceptions
+ // * No try/catch that also includes RuntimeExceptions
+ List<Concept> lista;
try {
-
- List<Concept> lista = this.client.extractConcepts(text, language);
- LiteralFactory literalFactory = LiteralFactory.getInstance();
-
- MGraph g = ci.getMetadata();
-
- UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
-
- for (Concept ne : lista) {
- List<UriRef> uris = this.getEntityRefForType(ne.getClassLabel());
-
- try {
- for (UriRef uri : uris)
- g.add(new TripleImpl(textAnnotation, DC_RELATION, uri));
- g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(ne.getConfidence())));
- } catch (NoConvertorException e) {
- log.error(e.getMessage(),e);
- }
- }
- } catch (Exception e) {
- log.error(e.getMessage(),e);
+ lista = this.client.extractConcepts(text, language);
+ } catch (IOException e) { //re-throw exceptions as EngineException
+ throw new EngineException("Error while calling the CELI classification"
+ +" service (configured URL: " +serviceURL+")!",e);
+ } catch (SOAPException e) {
+ throw new EngineException("Error wile encoding/decoding the request/"
+ +"response to the CELI classification service!",e);
+ }
+ if(lista.isEmpty()){ //not topics found
+ return; //nothing to do
+ }
+ MGraph g = ci.getMetadata();
+ //NOTE: EnhancementEngines that use "ENHANCE_ASYNC" need to acquire a
+ // writeLock before modifications to the enhancement metadata
+ ci.getLock().writeLock().lock();
+ try {
+ //see STANBOL-617 for rules how to encode extracted topics
+ //we need a single TextAnnotation to link all TopicAnnotations
+ UriRef textAnnotation = createTextEnhancement(ci, this);
+ // add the dc:type skos:Concept
+ g.add(new TripleImpl(textAnnotation, DC_TYPE, SKOS_CONCEPT));
+
+ //now create the fise:TopicAnnotations
+ for (Concept ne : lista) {
+ UriRef topicAnnotation = EnhancementEngineHelper.createTopicEnhancement(ci, this);
+ g.add(new TripleImpl(topicAnnotation, ENHANCER_ENTITY_REFERENCE, ne.getUri()));
+ g.add(new TripleImpl(topicAnnotation, ENHANCER_ENTITY_LABEL,
+ new PlainLiteralImpl(ne.getLabel())));
+ //TODO: currently I use owl:class as entity-type, because that is
+ // what the linked dbpedia ontology resources are.
+ g.add(new TripleImpl(topicAnnotation, ENHANCER_ENTITY_TYPE, OWL_CLASS));
+ g.add(new TripleImpl(topicAnnotation, ENHANCER_CONFIDENCE,
+ literalFactory.createTypedLiteral(ne.getConfidence())));
+ //link to the TextAnnotation
+ g.add(new TripleImpl(topicAnnotation, DC_RELATION, textAnnotation));
+ }
+ } finally {
+ ci.getLock().writeLock().unlock();
}
-
}
private boolean isLangSupported(String language) {
@@ -195,14 +269,6 @@ public class CeliClassificationEnhanceme
return false;
}
- private List<UriRef> getEntityRefForType(String classificationLabels) {
- List<UriRef> refs = new Vector<UriRef>();
- String[] tmps = classificationLabels.split(" ");
- for (String dbPediaLabel : tmps) {
- refs.add(new UriRef(NamespaceEnum.dbpedia_ont + dbPediaLabel));
- }
- return refs;
- }
@Override
public Map<String, Object> getServiceProperties() {
Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java Sat May 19 10:27:15 2012
@@ -1,23 +1,35 @@
package org.apache.stanbol.enhancer.engines.celi.classification.impl;
+import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Vector;
import javax.xml.soap.MessageFactory;
import javax.xml.soap.SOAPBody;
+import javax.xml.soap.SOAPException;
import javax.xml.soap.SOAPMessage;
import javax.xml.soap.SOAPPart;
import javax.xml.transform.stream.StreamSource;
+import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.util.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.stanbol.enhancer.engines.celi.utils.Utils;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
@@ -25,108 +37,258 @@ import org.w3c.dom.NodeList;
public class ClassificationClientHTTP {
- private final Logger log = LoggerFactory.getLogger(getClass());
-
+ private final static Logger log = LoggerFactory.getLogger(ClassificationClientHTTP.class);
+ //NOTE: Defining charset, content-type and SOAP prefix/suffix as
+ // constants does make more easy to configure those things
+ /**
+ * The UTF-8 {@link Charset}
+ */
+ private static final Charset UTF8 = Charset.forName("UTF-8");
+ /**
+ * The content type "text/xml; charset={@link #UTF8}"
+ */
+ private static final String CONTENT_TYPE = "text/xml; charset="+UTF8.name();
+ /**
+ * The XML declaration (version, encoding) followed by the SOAP envelope, header and
+ * the starting element of the body; the request elements are written separately.
+ */
+ private static final String SOAP_PREFIX = "<?xml version=\"1.0\" encoding=\""+UTF8.name()+"\"?>"
+ + "<soapenv:Envelope xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" "
+ + "xmlns:clas=\"http://linguagrid.org/v20110204/classification\"><soapenv:Header/><soapenv:Body>";
+ /**
+ * Closes the SOAP body and envelope.
+ */
+ private static final String SOAP_SUFFIX = "</soapenv:Body></soapenv:Envelope>";
+
+ //TODO: This should be configurable
private static final int maxResultToReturn = 3;
- private URL serviceEP;
- private String licenseKey;
+ private final URL serviceEP;
+ private final String licenseKey;
+
+ //NOTE: the request headers are the same for all request - so they can be
+ // initialized in the constructor.
+ private final Map<String,String> requestHeaders;
public ClassificationClientHTTP(URL serviceUrl, String licenseKey){
this.serviceEP=serviceUrl;
this.licenseKey=licenseKey;
+ Map<String,String> headers = new HashMap<String,String>();
+ headers.put("Content-Type", CONTENT_TYPE);
+ if(licenseKey != null){
+ String encoded = Base64.encode(this.licenseKey.getBytes(UTF8));
+ headers.put("Authorization", "Basic "+encoded);
+ }
+ this.requestHeaders = Collections.unmodifiableMap(headers);
}
-
- public String doPostRequest(URL url, String body) throws IOException {
-
- HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
- urlConn.setRequestMethod("POST");
- urlConn.setDoInput(true);
- if (null != body) {
- urlConn.setDoOutput(true);
- } else {
- urlConn.setDoOutput(false);
- }
- urlConn.setUseCaches(false);
- String contentType = "text/xml; charset=utf-8";
- urlConn.setRequestProperty("Content-Type", contentType);
- if(this.licenseKey!=null){
- String encoded = Base64.encode(this.licenseKey.getBytes("UTF-8"));
- urlConn.setRequestProperty("Authorization", "Basic "+encoded);
- }
-
- // send POST output
- if (null != body) {
- OutputStreamWriter printout = new OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
- printout.write(body);
- printout.flush();
- printout.close();
- }
-
- //close connection
- urlConn.disconnect();
-
- // get response data
- return IOUtils.toString(urlConn.getInputStream(), "UTF-8");
- }
-
-
- public List<Concept> extractConcepts(String text,String lang) {
- List<Concept> extractedConcepts = new Vector<Concept>();
+ /*
+ * NOTE: passing/returning a String requires creating in-memory copies
+ * of the sent/received data. Imagine users that send the text of
+ * 100-page PDF files to the Stanbol Enhancer.
+ * Because of that an implementation that directly streams the
+ * StringEscapeUtils.escapeXml(..) to the request is preferable
+ *
+ * This will no longer allow to debug the data of the request and
+ * response. See the commented main method at the end for alternatives
+ */
+// public String doPostRequest(URL url, String body) throws IOException {
+//
+// HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
+// urlConn.setRequestMethod("POST");
+// urlConn.setDoInput(true);
+// if (null != body) {
+// urlConn.setDoOutput(true);
+// } else {
+// urlConn.setDoOutput(false);
+// }
+// urlConn.setUseCaches(false);
+// String contentType = "text/xml; charset=utf-8";
+// urlConn.setRequestProperty("Content-Type", contentType);
+// if(this.licenseKey!=null){
+// String encoded = Base64.encode(this.licenseKey.getBytes("UTF-8"));
+// urlConn.setRequestProperty("Authorization", "Basic "+encoded);
+// }
+//
+// // send POST output
+// if (null != body) {
+// OutputStreamWriter printout = new OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
+// printout.write(body);
+// printout.flush();
+// printout.close();
+// }
+//
+// //close connection
+// urlConn.disconnect();
+//
+// // get response data
+// return IOUtils.toString(urlConn.getInputStream(), "UTF-8");
+// }
+
+
+ //NOTE: forward IOException and SOAPExceptions to allow correct error handling
+ // by the EnhancementJobManager.
+ // Also RuntimeExceptions MUST NOT be caught for the same reason!
+ public List<Concept> extractConcepts(String text,String lang) throws IOException, SOAPException {
+ if(text == null || text.isEmpty()){
+ //no text -> no classification
+ return Collections.emptyList();
+ }
+
+ //create the POST request
+ HttpURLConnection con = Utils.createPostRequest(serviceEP, requestHeaders);
+ //"stream" the request content directly to the buffered writer
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(con.getOutputStream(),UTF8));
+ writer.write(SOAP_PREFIX);
+ writer.write("<clas:classify>");
+ writer.write("<clas:user>wiki</clas:user>");//TODO: should the user be configurable?
+ writer.write("<clas:model>");
+ writer.write(lang);
+ writer.write("</clas:model>");
+ writer.write("<clas:text>");
+ StringEscapeUtils.escapeXml(writer, text); //write the escaped text directly to the request
+ writer.write("</clas:text>");
+ writer.write("</clas:classify>");
+ writer.write(SOAP_SUFFIX);
+ writer.close();
+
+ //Call the service
+ long start = System.currentTimeMillis();
+ InputStream stream = con.getInputStream();
+ log.debug("Request to {} took {}ms",serviceEP,System.currentTimeMillis()-start);
+
+ //NOTE: forward IOException and SOAPExceptions to allow correct error handling
+ // by the EnhancementJobManager.
+ // Also RuntimeExceptions MUST NOT be caught for the same reason!
+
+// try {
+
+ // Create SoapMessage
+ MessageFactory msgFactory = MessageFactory.newInstance();
+ SOAPMessage message = msgFactory.createMessage();
+ SOAPPart soapPart = message.getSOAPPart();
+
+ // NOTE: directly use the InputStream provided by the URLConnection!
+// ByteArrayInputStream stream = new ByteArrayInputStream(responseXml.getBytes("UTF-8"));
+ StreamSource source = new StreamSource(stream);
+
+ // Set contents of message
+ soapPart.setContent(source);
+
+ SOAPBody soapBody = message.getSOAPBody();
+ List<Concept> extractedConcepts = new Vector<Concept>();
+ NodeList nlist = soapBody.getElementsByTagNameNS("*","return");
+ HashSet<String> inserted=new HashSet<String>();
+ for (int i = 0; i < nlist.getLength() && i<maxResultToReturn; i++) {
+ //NOTE: do not catch RuntimeExceptions. Error handling is done by
+ // the EnhancementJobManager!
+// try {
+ Element result = (Element) nlist.item(i);
+
+ //NOTE: (rwesten) implemented a mapping from the CELI classification
+ // to the Stanbol fise:TopicEnhancements (STANBOL-617) that
+ // * one fise:TopicAnnotation is generated per "model"
+ // * the whole label string is used as fise:entity-label
+ // * the uri of the most specific dbpedia ontology type (see
+ // selectClassificationClass) is used as fise:entity-reference
+ // This has the intuition that for users it is easier to grasp
+ // the meaning of the whole label, while for machines the link
+ // to the most specific dbpedia ontology class is best suited.
+ String model = result.getElementsByTagNameNS("*","label").item(0).getTextContent();
+ model=model.substring(1, model.length()-1);
+ UriRef modelConcept = selectClassificationClass(model);
+ String conf=result.getElementsByTagNameNS("*","score").item(0).getTextContent();
+ Double confidence= new Double(conf);
+ extractedConcepts.add(new Concept(model,modelConcept,confidence));
+// } catch (Exception e) {
+// e.printStackTrace();
+// }
- try {
- String txt = StringEscapeUtils.escapeXml(text);
- String xmldata = "<soapenv:Envelope xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" xmlns:clas=\"http://linguagrid.org/v20110204/classification\"><soapenv:Header/><soapenv:Body> <clas:classify>"
- +"<clas:user>wiki</clas:user><clas:model>"+lang+"</clas:model><clas:text>"+txt+"</clas:text></clas:classify></soapenv:Body></soapenv:Envelope>";
-
-
- String responseXml = doPostRequest(this.serviceEP, xmldata);
- log.debug(responseXml);
-
- // Create SoapMessage
- MessageFactory msgFactory = MessageFactory.newInstance();
- SOAPMessage message = msgFactory.createMessage();
- SOAPPart soapPart = message.getSOAPPart();
-
- // Load the SOAP text into a stream source
- ByteArrayInputStream stream = new ByteArrayInputStream(responseXml.getBytes("UTF-8"));
- StreamSource source = new StreamSource(stream);
-
- // Set contents of message
- soapPart.setContent(source);
-
- SOAPBody soapBody = message.getSOAPBody();
- NodeList nlist = soapBody.getElementsByTagNameNS("*","return");
- HashSet<String> inserted=new HashSet<String>();
- for (int i = 0; i < nlist.getLength() && i<maxResultToReturn; i++) {
- try {
- Element result = (Element) nlist.item(i);
-
- String model = result.getElementsByTagNameNS("*","label").item(0).getTextContent();
- model=model.substring(1, model.length()-1);
- String conf=result.getElementsByTagNameNS("*","score").item(0).getTextContent();
- float confidence=Float.parseFloat(conf);
-
- String[] tmps=model.split(" ");
-
- for(String t: tmps){
- if(!inserted.contains(t)){
- extractedConcepts.add(new Concept(t, confidence));
- inserted.add(t);
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- }
- } catch (Exception e) {
- e.printStackTrace();
}
-
+// } catch (Exception e) {
+// e.printStackTrace();
+// }
return extractedConcepts;
}
-
+ /**
+ * TopicClassifications require only a single fise:entity-reference.
+ * However the CELI classification service delivers <p>
+ * <code><pre>
+ * <ns2:label>[Organisation HockeyTeam SportsTeam]</ns2:label>
+ * </pre></code>
+ * because of that this method needs to select one of the labels.<p>
+ * This method currently selects the 2nd token if there is more than one
+ * concept suggestion included. NOTE that the whole literal is used as
+ * fise:entity-label!
+ * @param classificationLabels the label string
+ * @return the dbpedia ontology URI built from the selected label
+ */
+ private UriRef selectClassificationClass(String classificationLabels) {
+ //NOTE: (rwesten) In general it would be better if CELI could provide
+ // de-referenceable URLs for those suggestions.
+ // If that is possible one would no longer need to link to the
+ // most specific dbpedia ontology class for a category e.g.
+ // http://dbpedia.org/ontology/HockeyTeam
+ // for
+ // [Organisation HockeyTeam SportsTeam]
+ // but e.g.
+ // http://linguagrid.org/category/HockeyTeam
+ // meaning the linguagrid could provide categories as skos thesaurus
+ // via it's web interface
+ int start = classificationLabels.charAt(0) == '[' ? 1 : 0;
+ int end = classificationLabels.charAt(classificationLabels.length()-1) == ']' ?
+ classificationLabels.length() - 1 : classificationLabels.length();
+ String[] tmps = classificationLabels.substring(start, end).split(" ");
+ return new UriRef(NamespaceEnum.dbpedia_ont.getNamespace()+ //the namespace
+ (tmps.length > 1 ? tmps[1] : tmps[0])); //the Class for the label
+ }
+
+ //NOTE: If you stream the contents directly to the stream, you can no longer
+ // debug the request/response. Because of that it is sometimes
+ // helpful to have a main method for those tests
+ // An even better variant would be to write a UnitTest for that!!
+ // This would be recommended if the called service is still in beta
+ // and may change at any time
+// public static void main(String[] args) throws Exception {
+// String lang = "fr";
+// String text = "Brigitte Bardot, née le 28 septembre " +
+// "1934 à Paris, est une actrice de cinéma et chanteuse française.";
+//
+// //For request testing
+// //Writer request = new StringWriter();
+//
+// //For response testing
+// HttpURLConnection con = Utils.createPostRequest(
+// new URL("http://linguagrid.org/LSGrid/ws/dbpedia-classification"),
+// Collections.singletonMap("Content-Type", CONTENT_TYPE));
+// Writer request = new OutputStreamWriter(con.getOutputStream(),UTF8);
+//
+// //"stream" the request content directly to the buffered writer
+// BufferedWriter writer = new BufferedWriter(request);
+//
+// writer.write(SOAP_PREFIX);
+// writer.write("<clas:classify>");
+// writer.write("<clas:user>wiki</clas:user>");//TODO: should the user be configurable?
+// writer.write("<clas:model>");
+// writer.write(lang);
+// writer.write("</clas:model>");
+// writer.write("<clas:text>");
+// StringEscapeUtils.escapeXml(writer, text); //write the escaped text directly to the request
+// writer.write("</clas:text>");
+// writer.write("</clas:classify>");
+// writer.write(SOAP_SUFFIX);
+// writer.close();
+//
+// //log the Request (if request testing)
+// //log.info("Request \n{}",request.toString());
+//
+// //for response testing we need to call the service
+// //Call the service
+// long start = System.currentTimeMillis();
+// InputStream stream = con.getInputStream();
+// log.info("Request to took {}ms",System.currentTimeMillis()-start);
+// log.info("Response:\n{}",IOUtils.toString(stream));
+// stream.close();
+// }
}
Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java Sat May 19 10:27:15 2012
@@ -1,28 +1,34 @@
package org.apache.stanbol.enhancer.engines.celi.classification.impl;
+import org.apache.clerezza.rdf.core.UriRef;
+
public class Concept {
- private String classLabel;
- private float confidence;
+ private final String label;
+ private final UriRef uri;
+ private final Double confidence;
- public Concept(String classLabel, float confidence) {
+ public Concept(String label, UriRef uri,Double confidence) {
super();
- this.classLabel = classLabel;
+ this.label = label;
+ this.uri = uri;
this.confidence = confidence;
}
- public String getClassLabel() {
- return classLabel;
- }
- public void setClassLabel(String classLabel) {
- this.classLabel = classLabel;
- }
- public float getConfidence() {
+
+ public Double getConfidence() {
return confidence;
}
- public void setConfidence(float confidence) {
- this.confidence = confidence;
- }
+
+
+ public String getLabel() {
+ return label;
+ }
+
+
+ public UriRef getUri() {
+ return uri;
+ }
}
Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java Sat May 19 10:27:15 2012
@@ -75,7 +75,6 @@ public class NERserviceClientHTTP {
//no text -> no extractions
return Collections.emptyList();
}
- List<NamedEntity> extractedNE = new Vector<NamedEntity>();
//create the POST request
HttpURLConnection con = Utils.createPostRequest(serviceEP, requestHeaders);
@@ -102,6 +101,9 @@ public class NERserviceClientHTTP {
soapPart.setContent(source);
SOAPBody soapBody = message.getSOAPBody();
+
+ //extract the results
+ List<NamedEntity> extractedNE = new Vector<NamedEntity>();
NodeList nlist = soapBody.getElementsByTagName("result");
for (int i = 0; i < nlist.getLength(); i++) {
Element result = (Element) nlist.item(i);
Modified: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java Sat May 19 10:27:15 2012
@@ -1,18 +1,25 @@
package org.apache.stanbol.enhancer.engines.celi.classification.impl;
+import static junit.framework.Assert.assertEquals;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_ENTITYANNOTATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
+import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTopicAnnotations;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.Dictionary;
+import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
+import junit.framework.Assert;
+
+import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
@@ -25,6 +32,8 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -76,11 +85,15 @@ public class CeliClassificationEnhanceme
classificationEngine.computeEnhancements(ci);
TestUtils.logEnhancements(ci);
-
- int textAnnoNum = checkAllTextAnnotations(ci.getMetadata(), TEXT);
- log.info(textAnnoNum + " TextAnnotations found ...");
- int entityAnnoNum = checkAllEntityAnnotations(ci.getMetadata());
- log.info(entityAnnoNum + " EntityAnnotations found ...");
+ HashMap<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
+ expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
+ expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(
+ classificationEngine.getClass().getName()));
+
+ int textAnnoNum = EnhancementStructureHelper.validateAllTextAnnotations(ci.getMetadata(), TEXT,expectedValues);
+ assertEquals("Only a single fise:TextAnnotation is expeted", 1, textAnnoNum);
+ int numTopicAnnotations = validateAllTopicAnnotations(ci.getMetadata() , expectedValues);
+ assertTrue("No TpocisAnnotations found", numTopicAnnotations > 0);
} catch (EngineException e) {
if (e.getCause() != null && e.getCause() instanceof UnknownHostException) {
log.warn("Celi Service not reachable -> offline? -> deactivate test");
@@ -90,25 +103,4 @@ public class CeliClassificationEnhanceme
}
}
- private int checkAllEntityAnnotations(MGraph g) {
- Iterator<Triple> entityAnnotationIterator = g.filter(null, RDF_TYPE, ENHANCER_ENTITYANNOTATION);
- int entityAnnotationCount = 0;
- while (entityAnnotationIterator.hasNext()) {
- UriRef entityAnnotation = (UriRef) entityAnnotationIterator.next().getSubject();
- entityAnnotationCount++;
- }
- return entityAnnotationCount;
- }
-
- private int checkAllTextAnnotations(MGraph g, String content) {
- Iterator<Triple> textAnnotationIterator = g.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
- // test if a textAnnotation is present
- assertTrue(textAnnotationIterator.hasNext());
- int textAnnotationCount = 0;
- while (textAnnotationIterator.hasNext()) {
- UriRef textAnnotation = (UriRef) textAnnotationIterator.next().getSubject();
- textAnnotationCount++;
- }
- return textAnnotationCount;
- }
}