You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/02/02 14:52:28 UTC
svn commit: r1239618 [1/2] - in /incubator/stanbol/trunk/enhancer: engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/ engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/ engin...

Author: rwesten
Date: Thu Feb  2 13:52:27 2012
New Revision: 1239618

URL: http://svn.apache.org/viewvc?rev=1239618&view=rev
Log:
STANBOL-478: Metaxa Engine now adds the extracted plain text as Blob to the ContentItem. All other engines retrieve content from Blobs.
STANBOL-479: All current (and not deprecated) Engines do now support async processing
STANBOL-46: Asynchronous enhancements are now finally supported by both the EnhancementJobManager and the EnhancementEngines.

Other changes:

* Added two utility methods to the ContentItemHelper that allow searching for Blobs based on mime types and retrieving the text from a Blob.
* Metaxa Engine now allows configuring a list of mime types that are ignored. This is basically to avoid processing of text/plain content, but could also be used by users to deactivate other mime types.
* geonames Engine: changed most of the logging to the level debug. In addition, this engine now uses the correct logger.
 

Added:
    incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java   (with props)
Modified:
    incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java
    incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
    incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java
    incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
    incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java
    incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
    incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java
    incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
    incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java
    incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
    incubator/stanbol/trunk/enhancer/engines/taxonomylinking/src/main/java/org/apache/stanbol/enhancer/engines/taxonomy/impl/TaxonomyLinkingEngine.java
    incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
    incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java
    incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemHelper.java
    incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemImpl.java
    incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java
    incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
    incubator/stanbol/trunk/enhancer/jobmanager/event/src/main/java/org/apache/stanbol/enhancer/jobmanager/event/impl/EnhancementJobHandler.java

Modified: incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java Thu Feb  2 13:52:27 2012
@@ -17,20 +17,18 @@
 package org.apache.stanbol.enhancer.engines.autotagging.impl;
 
 import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.PROPERTY_NAME;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
 
 import java.io.IOException;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Iterator;
 import java.util.List;
+import java.util.Map.Entry;
+import java.util.Set;
 
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
-import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.commons.io.IOUtils;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.Properties;
 import org.apache.felix.scr.annotations.Property;
@@ -39,11 +37,13 @@ import org.apache.felix.scr.annotations.
 import org.apache.stanbol.autotagging.Autotagger;
 import org.apache.stanbol.autotagging.TagInfo;
 import org.apache.stanbol.enhancer.engines.autotagging.AutotaggerProvider;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -64,7 +64,7 @@ import org.slf4j.LoggerFactory;
 public class RelatedTopicEnhancementEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements EnhancementEngine {
 
     protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
-
+    protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
     public static final String DEFAULT_NAME = "autotaggingRelatedTopic";
     
     private static final Logger log = LoggerFactory.getLogger(RelatedTopicEnhancementEngine.class);
@@ -87,25 +87,25 @@ public class RelatedTopicEnhancementEngi
                     + ci.getUri().getUnicodeString());
             return;
         }
-        String mimeType = ci.getMimeType().split(";", 2)[0];
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
+        if(contentPart == null){
+            throw new IllegalStateException("No ContentPart with a supported Mime Type"
+                    + "found for ContentItem "+ci.getUri()+"(supported: '"
+                    + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was" 
+                    + "NOT called and indicates a bug in the used EnhancementJobManager!");
+        }
         String text = "";
-        if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) {
-            try {
-                text = IOUtils.toString(ci.getStream(),"UTF-8");
-            } catch (IOException e) {
-                throw new InvalidContentException(this, ci, e);
-            }
-        } else {
-            Iterator<Triple> it = ci.getMetadata().filter(new UriRef(ci.getUri().getUnicodeString()), NIE_PLAINTEXTCONTENT, null);
-            while (it.hasNext()) {
-                text += it.next().getObject();
-            }
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
         }
         if (text.trim().length() == 0) {
             // TODO: make the length of the data a field of the ContentItem
             // interface to be able to filter out empty items in the canEnhance
             // method
-            log.warn("nothing to extract a topic from");
+            log.warn("ContentPart {} of ContentItem {} does contain no text to extract a topic from",
+                contentPart.getKey(),ci.getUri());
             return;
         }
 
@@ -115,10 +115,16 @@ public class RelatedTopicEnhancementEngi
         try {
             List<TagInfo> suggestions = autotagger.suggestForType(text, type);
             Collection<NonLiteral> noRelatedEnhancements = Collections.emptyList();
-            for (TagInfo tag : suggestions) {
-                EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory,
-                        graph, contentItemId,
-                        noRelatedEnhancements, tag);
+            //Acquire a write lock while writing the enhancement results
+            ci.getLock().writeLock().lock();
+            try {
+                for (TagInfo tag : suggestions) {
+                    EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory,
+                            graph, contentItemId,
+                            noRelatedEnhancements, tag);
+                }
+            } finally {
+                ci.getLock().writeLock().unlock();
             }
         } catch (IOException e) {
             throw new EngineException(this, ci, e);
@@ -126,17 +132,11 @@ public class RelatedTopicEnhancementEngi
     }
 
     public int canEnhance(ContentItem ci) {
-           String mimeType = ci.getMimeType().split(";",2)[0];
-        if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
-            return ENHANCE_SYNCHRONOUS;
-        }
-        // check for existence of textual content in metadata
-        UriRef subj = new UriRef(ci.getUri().getUnicodeString());
-        Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
-        if (it.hasNext()) {
-            return ENHANCE_SYNCHRONOUS;
+        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){
+            return ENHANCE_ASYNC; //RelatedTopic engine now supports async processing
+        } else {
+            return CANNOT_ENHANCE;
         }
-        return CANNOT_ENHANCE;
     }
 
     public void bindAutotaggerProvider(AutotaggerProvider autotaggerProvider) {

Added: incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java?rev=1239618&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java Thu Feb  2 13:52:27 2012
@@ -0,0 +1,93 @@
+package org.apache.stanbol.enhancer.engines.entitytagging.impl;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+
+import org.apache.clerezza.rdf.core.NonLiteral;
+import org.apache.clerezza.rdf.core.TripleCollection;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class NamedEntity {
+    private static final Logger log = LoggerFactory.getLogger(NamedEntity.class);
+    private final NonLiteral entity;
+    private final String name;
+    private final UriRef type;
+    private NamedEntity(NonLiteral entity, String name, UriRef type) {
+        this.entity = entity;
+        this.name = name;
+        this.type = type;
+    }
+    /**
+     * Getter for the Node providing the information about that entity
+     * @return the entity
+     */
+    public final NonLiteral getEntity() {
+        return entity;
+    }
+    /**
+     * Getter for the name
+     * @return the name
+     */
+    public final String getName() {
+        return name;
+    }
+    /**
+     * Getter for the type
+     * @return the type
+     */
+    public final UriRef getType() {
+        return type;
+    }
+    @Override
+    public int hashCode() {
+        return entity.hashCode();
+    }
+    @Override
+    public boolean equals(Object o) {
+        return o instanceof NamedEntity && entity.equals(((NamedEntity)o).entity);
+    }
+    @Override
+    public String toString() {
+        return String.format("NamedEntity %s (name=%s|type=%s)",entity,name,type);
+    }
+    /**
+     * Extracts the information of an {@link NamedEntity} from an
+     * {@link TechnicalClasses#ENHANCER_TEXTANNOTATION} instance.
+     * @param graph the graph with the information
+     * @param textAnnotation the text annotation instance
+     * @return the {@link NamedEntity} or <code>null</code> if the parsed
+     * text annotation is missing required information.
+     */
+    public static NamedEntity createFromTextAnnotation(TripleCollection graph, NonLiteral textAnnotation){
+        String name = EnhancementEngineHelper.getString(graph, textAnnotation, ENHANCER_SELECTED_TEXT);
+        if (name == null) {
+            log.debug("Unable to create NamedEntity for TextAnnotation {} "
+                    + "because property {} is not present",textAnnotation,ENHANCER_SELECTED_TEXT);
+            return null;
+        }
+        name = name.trim();
+        if(name.isEmpty()){
+            log.debug("Unable to process TextAnnotation {} because its selects "
+            		+ "an empty Stirng !",textAnnotation);
+            return null;
+        }
+        UriRef type = EnhancementEngineHelper.getReference(graph, textAnnotation, DC_TYPE);
+        if (type == null) {
+            log.warn("Unable to process TextAnnotation {} because property {}"
+                     + " is not present!",textAnnotation, DC_TYPE);
+            return null;
+        }
+        // remove punctuation form the search string
+        return new NamedEntity(textAnnotation,cleanupKeywords(name),type);
+    }        
+    /**
+     * Removes punctuation form a parsed string
+     */
+    private static String cleanupKeywords(String keywords) {
+        return keywords.replaceAll("\\p{P}", " ").trim();
+    }
+}
\ No newline at end of file

Propchange: incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java Thu Feb  2 13:52:27 2012
@@ -17,8 +17,6 @@
 package org.apache.stanbol.enhancer.engines.entitytagging.impl;
 
 import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
 
 import java.util.ArrayList;
@@ -28,12 +26,12 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Triple;
-import org.apache.clerezza.rdf.core.TripleCollection;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
@@ -52,7 +50,6 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
@@ -318,36 +315,72 @@ public class NamedEntityTaggingEngine 
         } else { // null indicates to use the Entityhub to lookup Entities
             site = null;
         }
-        UriRef contentItemId = ci.getUri();
-
         MGraph graph = ci.getMetadata();
         LiteralFactory literalFactory = LiteralFactory.getInstance();
-
-        // Retrieve the existing text annotations
-        Map<UriRef,List<UriRef>> textAnnotations = new HashMap<UriRef,List<UriRef>>();
-        for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it
-                .hasNext();) {
-            UriRef uri = (UriRef) it.next().getSubject();
-            if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) {
-                // this is not the most specific occurrence of this name: skip
-                continue;
-            }
-            // This is a first occurrence, collect any subsumed annotations
-            List<UriRef> subsumed = new ArrayList<UriRef>();
-            for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2.hasNext();) {
-                subsumed.add((UriRef) it2.next().getSubject());
+        // Retrieve the existing text annotations (requires read lock)
+        Map<NamedEntity,List<UriRef>> textAnnotations = new HashMap<NamedEntity,List<UriRef>>();
+        ci.getLock().readLock().lock();
+        try {
+            for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it
+                    .hasNext();) {
+                UriRef uri = (UriRef) it.next().getSubject();
+                if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) {
+                    // this is not the most specific occurrence of this name: skip
+                    continue;
+                }
+                NamedEntity namedEntity = NamedEntity.createFromTextAnnotation(graph, uri);
+                if(namedEntity != null){
+                    // This is a first occurrence, collect any subsumed annotations
+                    List<UriRef> subsumed = new ArrayList<UriRef>();
+                    for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2.hasNext();) {
+                        subsumed.add((UriRef) it2.next().getSubject());
+                    }
+                    textAnnotations.put(namedEntity, subsumed);
+                }
             }
-            textAnnotations.put(uri, subsumed);
+        } finally {
+            ci.getLock().readLock().unlock();
         }
-
-        for (Map.Entry<UriRef,List<UriRef>> entry : textAnnotations.entrySet()) {
+        //search the suggestions
+        Map<NamedEntity,List<Entity>> suggestions = new HashMap<NamedEntity,List<Entity>>(textAnnotations.size());
+        for (Entry<NamedEntity,List<UriRef>> entry : textAnnotations.entrySet()) {
             try {
-                computeEntityRecommentations(site,literalFactory, graph, contentItemId, entry.getKey(),
-                    entry.getValue());
+                List<Entity> entitySuggestions = computeEntityRecommentations(
+                    site, entry.getKey(),entry.getValue());
+                if(entitySuggestions != null && !entitySuggestions.isEmpty()){
+                    suggestions.put(entry.getKey(), entitySuggestions);
+                }
             } catch (EntityhubException e) {
                 throw new EngineException(this, ci, e);
             }
         }
+        //now write the results (requires write lock)
+        ci.getLock().writeLock().lock();
+        try {
+            RdfValueFactory factory = RdfValueFactory.getInstance();
+            Map<String, Representation> entityData = new HashMap<String,Representation>();
+            for(Entry<NamedEntity,List<Entity>> entitySuggestions : suggestions.entrySet()){
+                List<UriRef> subsumed = textAnnotations.get(entitySuggestions.getKey());
+                List<NonLiteral> annotationsToRelate = new ArrayList<NonLiteral>(subsumed);
+                annotationsToRelate.add(entitySuggestions.getKey().getEntity());
+                for(Entity suggestion : entitySuggestions.getValue()){
+                    log.debug("Add Suggestion {} for {}", suggestion.getId(), entitySuggestions.getKey());
+                    EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, ci.getUri(),
+                        annotationsToRelate, suggestion.getRepresentation(), nameField);
+                    if (dereferenceEntities) {
+                        entityData.put(suggestion.getId(), suggestion.getRepresentation());
+                    }
+                }
+            }
+            //if dereferneceEntities is true the entityData will also contain all
+            //Representations to add! If false entityData will be empty
+            for(Representation rep : entityData.values()){
+                graph.addAll(factory.toRdfRepresentation(rep).getRdfGraph());
+            }
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
+
     }
 
     /**
@@ -355,7 +388,6 @@ public class NamedEntityTaggingEngine 
      * @param site The {@link ReferencedSiteException} id or <code>null</code> to
      * use the {@link Entityhub}
      * @param literalFactory the {@link LiteralFactory} used to create RDF Literals
-     * @param graph the graph to write the lined entities
      * @param contentItemId the id of the contentItem
      * @param textAnnotation the text annotation to enhance
      * @param subsumedAnnotations other text annotations for the same entity 
@@ -363,42 +395,19 @@ public class NamedEntityTaggingEngine 
      * @throws EntityhubException On any Error while looking up Entities via
      * the Entityhub
      */
-    protected final Iterable<Entity> computeEntityRecommentations(ReferencedSite site,
-            LiteralFactory literalFactory,
-            MGraph graph,
-            UriRef contentItemId,
-            UriRef textAnnotation,
+    protected final List<Entity> computeEntityRecommentations(ReferencedSite site,
+            NamedEntity namedEntity,
             List<UriRef> subsumedAnnotations) throws EntityhubException {
         // First get the required properties for the parsed textAnnotation
         // ... and check the values
-        String name = EnhancementEngineHelper.getString(graph, textAnnotation, ENHANCER_SELECTED_TEXT);
-        if (name == null) {
-            log.info("Unable to process TextAnnotation " + textAnnotation + " because property"
-                     + ENHANCER_SELECTED_TEXT + " is not present");
-            return Collections.emptyList();
-        }
-        if(name.isEmpty()){
-            log.info("Unable to process TextAnnotation " + textAnnotation + 
-                " because an empty Stirng is selected by " + ENHANCER_SELECTED_TEXT + "");
-            return Collections.emptyList();
-        }
-
-        UriRef type = EnhancementEngineHelper.getReference(graph, textAnnotation, DC_TYPE);
-        if (type == null) {
-            log.warn("Unable to process TextAnnotation " + textAnnotation + " because property" + DC_TYPE
-                     + " is not present");
-            return Collections.emptyList();
-        }
-        // remove punctuation form the search string
-        name = cleanupKeywords(name);
 
-        log.debug("Process TextAnnotation " + name + " type=" + type);
+        log.debug("Process {}", namedEntity);
         FieldQuery query = site == null ? //if site is NULL use the Entityhub
                 entityhub.getQueryFactory().createFieldQuery() : 
                     site.getQueryFactory().createFieldQuery();
         // replace spaces with plus to create an AND search for all words in the name!
-        query.setConstraint(nameField, new TextConstraint(name));// name.replace(' ', '+')));
-        if (OntologicalClasses.DBPEDIA_PERSON.equals(type)) {
+        query.setConstraint(nameField, new TextConstraint(namedEntity.getName()));// name.replace(' ', '+')));
+        if (OntologicalClasses.DBPEDIA_PERSON.equals(namedEntity.getType())) {
             if (personState) {
                 if (personType != null) {
                     query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(personType));
@@ -408,7 +417,7 @@ public class NamedEntityTaggingEngine 
                 // ignore people
                 return Collections.emptyList();
             }
-        } else if (DBPEDIA_ORGANISATION.equals(type)) {
+        } else if (DBPEDIA_ORGANISATION.equals(namedEntity.getType())) {
             if (orgState) {
                 if (orgType != null) {
                     query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(orgType));
@@ -418,7 +427,7 @@ public class NamedEntityTaggingEngine 
                 // ignore people
                 return Collections.emptyList();
             }
-        } else if (OntologicalClasses.DBPEDIA_PLACE.equals(type)) {
+        } else if (OntologicalClasses.DBPEDIA_PLACE.equals(namedEntity.getType())) {
             if (this.placeState) {
                 if (this.placeType != null) {
                     query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(placeType));
@@ -435,9 +444,6 @@ public class NamedEntityTaggingEngine 
                     site.findEntities(query); //else the referenced site
         log.debug("{} results returned by query {}", results.size(), query);
 
-        List<NonLiteral> annotationsToRelate = new ArrayList<NonLiteral>();
-        annotationsToRelate.add(textAnnotation);
-        annotationsToRelate.addAll(subsumedAnnotations);
         Float maxScore = null;
         int exactCount = 0;
         List<Entity> matches = new ArrayList<Entity>(numSuggestions);
@@ -452,7 +458,7 @@ public class NamedEntityTaggingEngine 
             while(labels.hasNext() && !found){
                 Text label = labels.next();
                 if(label.getLanguage() == null || label.getLanguage().startsWith("en")){
-                    if(label.getText().equalsIgnoreCase(name)){
+                    if(label.getText().equalsIgnoreCase(namedEntity.getName())){
                         found = true;
                     }
                 }
@@ -464,7 +470,6 @@ public class NamedEntityTaggingEngine 
                 matches.add(guess);
             }
         }
-        RdfValueFactory factory = RdfValueFactory.getInstance();
         //now write the results
         for(int i=0;i<matches.size();i++){
             Representation rep = matches.get(i).getRepresentation();
@@ -477,15 +482,8 @@ public class NamedEntityTaggingEngine 
                         maxScore.doubleValue()+(score != null?score.doubleValue():0));
                 }
             }
-            log.debug("Adding {} to ContentItem {}", rep.getId(), contentItemId);
-            EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, contentItemId,
-                annotationsToRelate, rep, nameField);
-
-            if (dereferenceEntities) {
-                graph.addAll(factory.toRdfRepresentation(rep).getRdfGraph());
-            }
         }
-        return results;
+        return matches;
     }
 
     public int canEnhance(ContentItem ci) {
@@ -493,7 +491,7 @@ public class NamedEntityTaggingEngine 
          * This engine consumes existing enhancements because of that it can enhance any type of ci! TODO: It
          * would also be possible to check here if there is an TextAnnotation and use that as result!
          */
-        return ENHANCE_SYNCHRONOUS;
+        return ENHANCE_ASYNC; //Entity tagging now supports asyc processing
     }
 
     @Override
@@ -502,10 +500,4 @@ public class NamedEntityTaggingEngine 
             (Object) defaultOrder));
     }
 
-    /**
-     * Removes punctuation form a parsed string
-     */
-    private static String cleanupKeywords(String keywords) {
-        return keywords.replaceAll("\\p{P}", " ").trim();
-    }
 }

Modified: incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java Thu Feb  2 13:52:27 2012
@@ -99,7 +99,7 @@ public class LocationEnhancementEngine 
 
     public static final Map<String, Collection<UriRef>> FEATURE_TYPE_CONCEPT_MAPPINGS;
 
-    private static final Logger log = LoggerFactory.getLogger(EnhancementEngineHelper.class);
+    private static final Logger log = LoggerFactory.getLogger(LocationEnhancementEngine.class);
 
     /**
      * Default value for minimum scores of search results are added to the
@@ -372,9 +372,9 @@ public class LocationEnhancementEngine 
             }
             if (results != null) {
                 for (Toponym result : results) {
-                    log.info("process result " + result.getGeoNameId() + " " + result.getName());
+                    log.debug("process result {} {}",result.getGeoNameId(),result.getName());
                     Double score = getToponymScore(result);
-                    log.info("  > score " + score);
+                    log.debug("  > score {}",score);
                     if (score != null) {
                         if (score < minScore) {
                             //if score is lower than the under bound, than stop
@@ -391,9 +391,9 @@ public class LocationEnhancementEngine 
                     //write the enhancement!
                     NonLiteral locationEnhancement = writeEntityEnhancement(
                             contentItemId, graph, literalFactory, result, entry.getValue(), null, null);
-                    log.info("  > " + score + " >= " + minHierarchyScore);
+                    log.debug("  > {}  >= {}",score,minHierarchyScore);
                     if (score != null && score >= minHierarchyScore) {
-                        log.info("  > getHierarchy for " + result.getGeoNameId() + " " + result.getName());
+                        log.debug("  > getHierarchy for {} {}",result.getGeoNameId(),result.getName());
                         //get the hierarchy
                         try {
                             Iterator<Toponym> hierarchy = getHierarchy(result).iterator();
@@ -409,7 +409,7 @@ public class LocationEnhancementEngine 
                                 if (result.getGeoNameId() != hierarchyEntry.getGeoNameId()) {
                                     //TODO: add additional checks based on possible
                                     //      configuration here!
-                                    log.info("    - write hierarchy " + hierarchyEntry.getGeoNameId() + " " + hierarchyEntry.getName());
+                                    log.debug("    - write hierarchy {} {}",hierarchyEntry.getGeoNameId(),hierarchyEntry.getName());
                                     /*
                                      * The hierarchy service dose not provide a score, because it would be 1.0
                                      * so we need to set the score to this value.

Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Thu Feb  2 13:52:27 2012
@@ -16,7 +16,6 @@
 */
 package org.apache.stanbol.enhancer.engines.keywordextraction.engine;
 
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
 import static org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum.getFullName;
 
 import java.io.IOException;
@@ -28,6 +27,7 @@ import java.util.Dictionary;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 
 import org.apache.clerezza.rdf.core.Language;
@@ -38,7 +38,6 @@ import org.apache.clerezza.rdf.core.Trip
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
-import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
@@ -66,17 +65,18 @@ import org.apache.stanbol.enhancer.engin
 import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory;
 import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.ReferencedSiteSearcher;
 import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.TrackingEntitySearcher;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
 import org.apache.stanbol.entityhub.servicesapi.Entityhub;
-import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
 import org.apache.stanbol.entityhub.servicesapi.model.Reference;
 import org.apache.stanbol.entityhub.servicesapi.model.Text;
 import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
@@ -132,6 +132,10 @@ public class KeywordLinkingEngine 
      */
     protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
     /**
+     * Contains the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+     */
+    protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+    /**
      * The default value for the Execution of this Engine.
      * This Engine creates TextAnnotations that should not be processed by other Engines.
      * Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT}
@@ -309,17 +313,11 @@ public class KeywordLinkingEngine 
 
     @Override
     public int canEnhance(ContentItem ci) throws EngineException {
-        String mimeType = ci.getMimeType().split(";", 2)[0];
-        if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
-            return ENHANCE_SYNCHRONOUS;
-        }
-        // check for existence of textual content in metadata
-        UriRef subj = ci.getUri();
-        Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
-        if (it.hasNext()) {
-            return ENHANCE_SYNCHRONOUS;
+        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){
+            return ENHANCE_ASYNC; //KeywordLinking now supports async processing
+        } else {
+            return CANNOT_ENHANCE;
         }
-        return CANNOT_ENHANCE;
     }
 
     @Override
@@ -327,28 +325,53 @@ public class KeywordLinkingEngine 
         if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
             throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
         }
-        String mimeType = ci.getMimeType().split(";", 2)[0];
-        String text = extractText(ci, mimeType);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
+        if(contentPart == null){
+            throw new IllegalStateException("No ContentPart with a supported Mime Type"
+                    + "found for ContentItem "+ci.getUri()+"(supported: '"
+                    + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was" 
+                    + "NOT called and indicates a bug in the used EnhancementJobManager!");
+        }
+        String text;
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(String.format("Unable to extract "
+                +" text from ContentPart %s of ContentItem %s!",
+                contentPart.getKey(),ci.getUri()),e);
+        }
         if (text.trim().length() == 0) {
             // TODO: make the length of the data a field of the ContentItem
             // interface to be able to filter out empty items in the canEnhance
             // method
-            log.warn("nothing to extract knowledge from in ContentItem {}", ci);
+            log.warn("ContentPart {} of ContentItem does not contain any Text to extract knowledge from",
+                contentPart.getKey(), ci);
             return;
         }
         //Determine the language
-        String language = extractLanguage(ci);
+        String language;
+        ci.getLock().readLock().lock();
+        try {
+         language = extractLanguage(ci);
+        } finally {
+            ci.getLock().readLock().unlock();
+        }
         if(isProcessableLanguages(language)){
             log.debug("computeEnhancements for ContentItem {} language {} text={}", 
                 new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100)});
             
-            EntityLinker taxonomyLinker = new EntityLinker(
+            EntityLinker entityLinker = new EntityLinker(
                 analysedContentFactory.create(text, language),
                 entitySearcher, linkerConfig);
             //process
-            taxonomyLinker.process();
-            //write results
-            writeEnhancements(ci, taxonomyLinker.getLinkedEntities().values(), language);
+            entityLinker.process();
+            //write results (requires a write lock)
+            ci.getLock().writeLock().lock();
+            try {
+                writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
+            } finally {
+                ci.getLock().writeLock().unlock();
+            }
         } else {
             log.debug("ignore ContentItem {} because language '{}' is not configured to" +
             		"be processed by this engine.",ci.getUri().getUnicodeString(),language);
@@ -456,39 +479,6 @@ public class KeywordLinkingEngine 
         }
     }
 
-    /**
-     * Extracts the text from the parsed contentItem. In case the content type is
-     * plain text, it directly reads the text from the stream. In other cases it
-     * tries to read the string representation from the metadata by looking for
-     * values of the {@link org.apache.stanbol.enhancer.servicesapi.rdf.Properties#NIE_PLAINTEXTCONTENT}
-     * property.<p>
-     * TODO: This is a Workaround for the currently not implemented Adapter
-     * Pattern for the Stanbol Enhancer.
-     * @param ci
-     * @param mimeType
-     * @return
-     * @throws InvalidContentException
-     */
-    private String extractText(ContentItem ci, String mimeType) throws InvalidContentException {
-        String text;
-        if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) {
-            try {
-                text = IOUtils.toString(ci.getStream(),"UTF-8");
-            } catch (IOException e) {
-                throw new InvalidContentException(this, ci, e);
-            }
-        } else {
-            //TODO: change that as soon the Adapter Pattern is used for multiple
-            // mimetype support.
-            StringBuilder textBuilder = new StringBuilder();
-            Iterator<Triple> it = ci.getMetadata().filter(new UriRef(ci.getUri().getUnicodeString()), NIE_PLAINTEXTCONTENT, null);
-            while (it.hasNext()) {
-                textBuilder.append(it.next().getObject());
-            }
-            text = textBuilder.toString();
-        }
-        return text;
-    }
     
     /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      * Methods for activate() and deactivate() the properties configureable via

Modified: incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java Thu Feb  2 13:52:27 2012
@@ -17,30 +17,30 @@
 package org.apache.stanbol.enhancer.engines.langid;
 
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
 
 import java.io.IOException;
 import java.util.Collections;
 import java.util.Dictionary;
-import java.util.Iterator;
 import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
 
 import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
-import org.apache.commons.io.IOUtils;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.Properties;
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.tika.language.LanguageIdentifier;
 import org.osgi.service.cm.ConfigurationException;
@@ -81,6 +81,10 @@ public class LangIdEnhancementEngine 
      * This contains the only MIME type directly supported by this enhancement engine.
      */
     private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+    /**
+     * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+     */
+    private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
 
     /**
      * This contains the logger.
@@ -115,36 +119,31 @@ public class LangIdEnhancementEngine 
     }
 
     public int canEnhance(ContentItem ci) throws EngineException {
-        String mimeType = ci.getMimeType().split(";", 2)[0];
-        if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
-            return ENHANCE_SYNCHRONOUS;
-        }
-
-        // TODO: check whether there is the graph contains the text
-        UriRef subj = ci.getUri();
-        Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
-        if (it.hasNext()) {
-            return ENHANCE_SYNCHRONOUS;
+        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null){
+            return ENHANCE_ASYNC; //Langid now supports async processing
+        } else {
+            return CANNOT_ENHANCE;
         }
-        return CANNOT_ENHANCE;
     }
 
     public void computeEnhancements(ContentItem ci) throws EngineException {
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
+        if(contentPart == null){
+            throw new IllegalStateException("No ContentPart with Mimetype '"
+                    + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
+                    + ": This is also checked in the canEnhance method! -> This "
+                    + "indicated an Bug in the implementation of the "
+                    + "EnhancementJobManager!");
+        }
         String text = "";
-        if (TEXT_PLAIN_MIMETYPE.equals(ci.getMimeType())) {
-            try {
-                text = IOUtils.toString(ci.getStream(),"UTF-8");
-            } catch (IOException e) {
-                throw new InvalidContentException(this, ci, e);
-            }
-        } else {
-            Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null);
-            while (it.hasNext()) {
-                text += it.next().getObject();
-            }
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
         }
         if (text.trim().length() == 0) {
-            log.warn("no text found");
+            log.info("No text contained in ContentPart {} of ContentItem {}",
+                contentPart.getKey(),ci.getUri());
             return;
         }
 
@@ -159,8 +158,13 @@ public class LangIdEnhancementEngine 
 
         // add language to metadata
         MGraph g = ci.getMetadata();
-        UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
-        g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
+        ci.getLock().writeLock().lock();
+        try {
+            UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+            g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
     }
 
     public int getProbeLength() {

Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java Thu Feb  2 13:52:27 2012
@@ -16,20 +16,29 @@
  */
 package org.apache.stanbol.enhancer.engines.metaxa;
 
+import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID;
+
+import java.io.BufferedWriter;
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.clerezza.rdf.core.BNode;
-import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.clerezza.rdf.core.impl.TypedLiteralImpl;
 import org.apache.felix.scr.annotations.Component;
@@ -39,12 +48,14 @@ import org.apache.stanbol.enhancer.engin
 import org.apache.stanbol.enhancer.engines.metaxa.core.RDF2GoUtils;
 import org.apache.stanbol.enhancer.engines.metaxa.core.html.BundleURIResolver;
 import org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlExtractorFactory;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.InMemoryBlob;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.ontoware.aifbcommons.collection.ClosableIterator;
 import org.ontoware.rdf2go.model.Model;
@@ -54,6 +65,7 @@ import org.ontoware.rdf2go.model.node.Da
 import org.ontoware.rdf2go.model.node.Node;
 import org.ontoware.rdf2go.model.node.PlainLiteral;
 import org.ontoware.rdf2go.model.node.URI;
+import org.ontoware.rdf2go.model.node.impl.URIImpl;
 import org.osgi.framework.BundleContext;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
@@ -80,6 +92,11 @@ public class MetaxaEngine 
     private static final Logger log = LoggerFactory.getLogger(MetaxaEngine.class);
 
     /**
+     * Plain text content of a content item.
+      */
+    public static final UriRef NIE_PLAINTEXTCONTENT = new UriRef(NamespaceEnum.nie + "plainTextContent");
+    private static final URIImpl NIE_PLAINTEXT_PROPERTY = new URIImpl(NIE_PLAINTEXTCONTENT.getUnicodeString());
+    /**
      * The default value for the Execution of this Engine. Currently set to
      * {@link ServiceProperties#ORDERING_PRE_PROCESSING}
      */
@@ -97,6 +114,8 @@ public class MetaxaEngine 
     @Property(value=MetaxaEngine.DEFAULT_HTML_EXTRACTOR_REGISTRY)
     public static final String HTML_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.metaxa.htmlextractors";
 
+    @Property(value={"text/plain"},cardinality=1000)
+    public static final String IGNORE_MIME_TYPES = "org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes";
     private MetaxaCore extractor;
     
     BundleContext bundleContext;
@@ -104,6 +123,8 @@ public class MetaxaEngine 
     public static final String DEFAULT_EXTRACTION_REGISTRY = "extractionregistry.xml";
     public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = "htmlextractors.xml";
     
+    private Set<String> ignoredMimeTypes;
+
     /**
      * The activate method.
      *
@@ -114,25 +135,38 @@ public class MetaxaEngine 
         super.activate(ce);
         String extractionRegistry = DEFAULT_EXTRACTION_REGISTRY;
         String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY;
-        if (ce != null) {
-            this.bundleContext = ce.getBundleContext();
-            BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
-            try {
-                Dictionary<String, String> properties = ce.getProperties();
-                String confFile = properties.get(GLOBAL_EXTRACTOR_REGISTRY);
-                if (confFile != null && confFile.trim().length() > 0) {
-                    extractionRegistry = confFile;
-                }
-                confFile = properties.get(HTML_EXTRACTOR_REGISTRY);
-                if (confFile != null && confFile.trim().length() > 0) {
-                    htmlExtractors = confFile;
+        this.bundleContext = ce.getBundleContext();
+        BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
+        try {
+            Dictionary<String, Object> properties = ce.getProperties();
+            String confFile = (String)properties.get(GLOBAL_EXTRACTOR_REGISTRY);
+            if (confFile != null && confFile.trim().length() > 0) {
+                extractionRegistry = confFile;
+            }
+            confFile = (String)properties.get(HTML_EXTRACTOR_REGISTRY);
+            if (confFile != null && confFile.trim().length() > 0) {
+                htmlExtractors = confFile;
+            }
+            this.extractor = new MetaxaCore(extractionRegistry);
+            HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors;
+        } catch (IOException e) {
+            log.error(e.getLocalizedMessage(), e);
+            throw e;
+        }
+        Object value = ce.getProperties().get(IGNORE_MIME_TYPES);
+        if(value instanceof String[]){
+            ignoredMimeTypes = new HashSet<String>(Arrays.asList((String[])value));
+        } else if(value instanceof Iterable<?>){
+            ignoredMimeTypes = new HashSet<String>();
+            for(Object mimeType : (Iterable<?>)value){
+                if(mimeType != null){
+                    ignoredMimeTypes.add(mimeType.toString());
                 }
-                this.extractor = new MetaxaCore(extractionRegistry);
-                HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors;
-            } catch (IOException e) {
-                log.error(e.getLocalizedMessage(), e);
-                throw e;
             }
+        } else if(value != null && !value.toString().isEmpty()){
+            ignoredMimeTypes = Collections.singleton(value.toString());
+        } else {
+            ignoredMimeTypes = Collections.singleton("text/plain");
         }
     }
 
@@ -147,18 +181,26 @@ public class MetaxaEngine 
     }
 
     public int canEnhance(ContentItem ci) throws EngineException {
-        String mimeType = ci.getMimeType().split(";", 2)[0];
-        if (this.extractor.isSupported(mimeType)) {
-            return ENHANCE_SYNCHRONOUS;
+        String mimeType = ci.getMimeType();
+        if (!ignoredMimeTypes.contains(mimeType) && 
+                this.extractor.isSupported(mimeType)) {
+            return ENHANCE_ASYNC; //supports now asynchronous execution!
         }
         return CANNOT_ENHANCE;
     }
 
     public void computeEnhancements(ContentItem ci) throws EngineException {
-
         try {
             // get model from the extraction
-            Model m = this.extractor.extract(ci.getStream(), ci.getUri().getUnicodeString(), ci.getMimeType());
+            URIImpl docId;
+            Model m;
+            ci.getLock().readLock().lock();
+            try {
+                docId = new URIImpl(ci.getUri().getUnicodeString());
+                m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
+            } finally {
+                ci.getLock().readLock().unlock();
+            }
             // add the statements from this model to the Metadata model
             if (null != m) {
                 /*
@@ -166,28 +208,62 @@ public class MetaxaEngine 
                log.info(text);
                 */
                 // get the model where to add the statements
-                MGraph g = ci.getMetadata();
+                /*
+                 * NOTE(rwesten): 
+                 *  There is no need to create a TextEnhancement to mark that
+                 *  a ContentItem was processed by Metaxa, because the
+                 *  ExecutionMetadata records this anyway.
+                 */
+                //     
                 // create enhancement
-                UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+                //UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
                 // set confidence value to 1.0
-                LiteralFactory literalFactory = LiteralFactory.getInstance();
-                g.add(new TripleImpl(textEnhancement, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(1.0)));
+                //g.add(new TripleImpl(textEnhancement, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(1.0)));
                 RDF2GoUtils.urifyBlankNodes(m);
                 HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
                 ClosableIterator<Statement> it = m.iterator();
+                ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
+                Charset charset = Charset.forName("UTF-8");
+                BufferedWriter out = new BufferedWriter(new OutputStreamWriter(byteOut, charset));
+                MGraph g = new SimpleMGraph(); //first add to a temporary graph
                 while (it.hasNext()) {
                     Statement oneStmt = it.next();
-
-                    NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
-                    UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
-                    Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
-
-                    if (null != subject && null != predicate && null != object) {
-                        Triple t = new TripleImpl(subject, predicate, object);
-                        g.add(t);
-                        log.debug("added " + t.toString());
+                    //we need to treat triples that provide the plain text
+                    //version differently. The objects of such triples need
+                    //to be written to the plain text Blob!
+                    if(oneStmt.getSubject().equals(docId) && 
+                            oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)){
+                        out.write(oneStmt.getObject().toString());
+                    } else { //add metadata to the metadata of the contentItem
+                        NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
+                        UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
+                        Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
+    
+                        if (null != subject && null != predicate && null != object) {
+                            Triple t = new TripleImpl(subject, predicate, object);
+                            g.add(t);
+                            log.debug("added " + t.toString());
+                        }
                     }
                 }
+                ci.getLock().writeLock().lock();
+                try { 
+                    //now acquire a write lock and add the extracted 
+                    //metadata to the content item
+                    ci.getMetadata().addAll(g);
+                } finally {
+                    ci.getLock().writeLock().unlock();
+                }
+                out.close();
+                byte[] plainTextData = byteOut.toByteArray();
+                if(plainTextData.length > 0){
+                    //add plain text to the content item
+                    UriRef blobUri = new UriRef("urn:metaxa:plain-text:"+randomUUID());
+                    Blob plainTextBlob = new InMemoryBlob(plainTextData, 
+                        "text/plain;charset="+charset.toString());
+                    ci.addPart(blobUri, plainTextBlob);
+                    //TODO: add contentPart metadata to the contentItem
+                }
                 it.close();
                 m.close();
             }

Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java Thu Feb  2 13:52:27 2012
@@ -102,7 +102,7 @@ public class MetaxaCore {
      *             if there is an error when reading the input stream
      */
     public Model extract(
-            InputStream in, String docId, String mimeType)
+            InputStream in, URIImpl docId, String mimeType)
             throws ExtractorException, IOException {
 
         @SuppressWarnings("rawtypes")
@@ -116,7 +116,7 @@ public class MetaxaCore {
             RDFContainerFactory containerFactory =
                 new RDFContainerFactoryImpl();
             RDFContainer container =
-                containerFactory.getRDFContainer(new URIImpl(docId));
+                containerFactory.getRDFContainer(docId);
             extractor.extract(
                 container.getDescribedUri(),
                 new BufferedInputStream(in, 8192),

Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties Thu Feb  2 13:52:27 2012
@@ -37,4 +37,8 @@ of a resource on the bundle classpath th
 
 org.apache.stanbol.enhancer.engines.metaxa.htmlextractors.name=Html Extractors
 org.apache.stanbol.enhancer.engines.metaxa.htmlextractors.description=The path of a \
-resource on the bundle classpath that specifies which extractors are used for HTML pages.
\ No newline at end of file
+resource on the bundle classpath that specifies which extractors are used for HTML pages.
+
+org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes.name=Ignored Mime Types
+org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes.description=This allows to \
+provide a list of MIME TYPES that are not processed by this engine.
\ No newline at end of file

Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java Thu Feb  2 13:52:27 2012
@@ -35,6 +35,7 @@ import org.ontoware.rdf2go.model.Model;
 import org.ontoware.rdf2go.model.Statement;
 import org.ontoware.rdf2go.model.node.BlankNode;
 import org.ontoware.rdf2go.model.node.Variable;
+import org.ontoware.rdf2go.model.node.impl.URIImpl;
 import org.semanticdesktop.aperture.extractor.ExtractorException;
 import org.semanticdesktop.aperture.vocabulary.NMO;
 import org.slf4j.Logger;
@@ -88,7 +89,7 @@ public class TestMetaxaCore {
         InputStream in = getResourceAsStream(testFile);
         assertNotNull("failed to load resource " + testFile, in);
 
-        Model m = extractor.extract(in, "file://" + testFile, "application/pdf");
+        Model m = extractor.extract(in, new URIImpl("file://" + testFile), "application/pdf");
         String text = MetaxaCore.getText(m);
         // get expected result
         InputStream in2 = getResourceAsStream(testResultFile);
@@ -119,7 +120,7 @@ public class TestMetaxaCore {
         InputStream in = getResourceAsStream(testFile);
         assertNotNull("failed to load resource " + testFile, in);
 
-        Model m = extractor.extract(in, "file://" + testFile, "text/html");
+        Model m = extractor.extract(in, new URIImpl("file://" + testFile), "text/html");
         String text = MetaxaCore.getText(m);
         // get expected result
         InputStream in2 = getResourceAsStream(testResultFile);
@@ -149,7 +150,7 @@ public class TestMetaxaCore {
         InputStream in = getResourceAsStream(testFile);
         assertNotNull("failed to load resource " + testFile, in);
 
-        Model m = extractor.extract(in, "file://" + testFile, "text/html");
+        Model m = extractor.extract(in, new URIImpl("file://" + testFile), "text/html");
         String text = MetaxaCore.getText(m);
         // get expected result
         InputStream in2 = getResourceAsStream(testResultFile);
@@ -169,7 +170,7 @@ public class TestMetaxaCore {
       String testFile = "mail-multipart-test.eml";
       InputStream in = getResourceAsStream(testFile);
       assertNotNull("failed to load resource " + testFile, in);
-      Model m = extractor.extract(in, "file://" + testFile, "message/rfc822");
+      Model m = extractor.extract(in, new URIImpl("file://" + testFile), "message/rfc822");
       boolean textContained = m.contains(Variable.ANY, NMO.plainTextMessageContent, Variable.ANY);
       assertTrue(textContained);
     }

Modified: incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java Thu Feb  2 13:52:27 2012
@@ -23,7 +23,6 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
 
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
@@ -44,9 +43,11 @@ import java.util.Collection;
 import java.util.Collections;
 import java.util.Dictionary;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
-import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
 
 import org.apache.clerezza.rdf.core.Graph;
 import org.apache.clerezza.rdf.core.Literal;
@@ -73,12 +74,14 @@ import org.apache.felix.scr.annotations.
 import org.apache.felix.scr.annotations.Reference;
 import org.apache.felix.scr.annotations.Service;
 import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.osgi.framework.BundleContext;
 import org.osgi.service.cm.ConfigurationException;
@@ -108,9 +111,10 @@ public class OpenCalaisEngine 
 
     /**
      * This contains the directly supported MIME types of this enhancement engine.
-     * For handling other mime-types the plain text must be contained in the metadata as by Metaxa.
      */
-    protected static final List<String> SUPPORTED_MIMETYPES = Arrays.asList(new String[]{"text/plain", "text/html"});
+    protected static final Set<String> SUPPORTED_MIMETYPES = 
+            Collections.unmodifiableSet(new HashSet<String>(
+                    Arrays.asList("text/plain", "text/html")));
 
     /**
      * This contains a list of languages supported by OpenCalais.
@@ -118,7 +122,9 @@ public class OpenCalaisEngine 
      * it is left to the grace of the OpenCalais whether it accepts the text.
      * OpenCalais uses its own language identification anyway.
      */
-    protected static final List<String> SUPPORTED_LANGUAGES = Arrays.asList(new String[]{"en", "fr", "es"});
+    protected static final Set<String> SUPPORTED_LANGUAGES = 
+            Collections.unmodifiableSet(new HashSet<String>(
+                    Arrays.asList("en", "fr", "es")));
 
     /**
      * The default value for the Execution of this Engine. Currently set to
@@ -248,62 +254,53 @@ public class OpenCalaisEngine 
     }
 
     public int canEnhance(ContentItem ci) throws EngineException {
-        //Engine will no longer activate if no license key is set
-//        if (getLicenseKey() == null || getLicenseKey().trim().length() == 0) {
-//            //do nothing if no license key is defined
-//            log.warn("No license key defined. The engine will not work!");
-//            return CANNOT_ENHANCE;
-//        }
-        UriRef subj = ci.getUri();
-        String mimeType = ci.getMimeType().split(";", 2)[0];
-        if (SUPPORTED_MIMETYPES.contains(mimeType.toLowerCase())) {
-            // check language
+        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){
             String language = getMetadataLanguage(ci.getMetadata(), null);
             if (language != null && !SUPPORTED_LANGUAGES.contains(language)) {
-                log.warn("Wrong language for Calais: {}", language);
+                log.info("OpenCalais can not process ContentItem {} because "
+                    + "language {} is not supported (supported: {})",
+                    new Object[]{ci.getUri(),language,SUPPORTED_LANGUAGES});
                 return CANNOT_ENHANCE;
             }
-            return ENHANCE_SYNCHRONOUS;
-        } else {
-            // TODO: check whether the metadata graph contains the text
-            Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
-            if (it.hasNext()) {
-                return ENHANCE_SYNCHRONOUS;
-            }
-        }
+            return ENHANCE_ASYNC; //OpenCalais now supports async processing!
+        } 
         return CANNOT_ENHANCE;
     }
 
     public void computeEnhancements(ContentItem ci) throws EngineException {
-        String mimeType = ci.getMimeType().split(";", 2)[0].toLowerCase();
-        String text = "";
-        if (SUPPORTED_MIMETYPES.contains(mimeType)) {
-            try {
-                text = IOUtils.toString(ci.getStream(),"UTF-8");
-            } catch (IOException e) {
-                throw new InvalidContentException(this, ci, e);
-            }
-        } else {
-            mimeType = "text/plain";
-            text = getMetadataText(ci.getMetadata(), ci.getUri());
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
+        if(contentPart == null){
+            throw new IllegalStateException("No ContentPart with an supported Mimetype '"
+                    + SUPPORTED_MIMETYPES+"' found for ContentItem "+ci.getUri()
+                    + ": This is also checked in the canEnhance method! -> This "
+                    + "indicated an Bug in the implementation of the "
+                    + "EnhancementJobManager!");
         }
-        if (text == null) {
-            log.warn("no text found");
-            return;
+        String text;
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
         }
 
-        MGraph calaisModel = getCalaisAnalysis(text, mimeType);
+        MGraph calaisModel = getCalaisAnalysis(text, contentPart.getValue().getMimeType());
         if (calaisModel != null) {
-            createEnhancements(queryModel(calaisModel), ci);
-            if (log.isDebugEnabled()) {
-              Serializer serializer = Serializer.getInstance();
-              ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
-              serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
-              try {
-                log.debug("Calais Enhancements:\n{}",debugStream.toString("UTF-8"));
-              } catch (UnsupportedEncodingException e) {
-                e.printStackTrace();
-              }
+            //Acquire a write lock on the ContentItem when adding the enhancements
+            ci.getLock().writeLock().lock();
+            try {
+                createEnhancements(queryModel(calaisModel), ci);
+                if (log.isDebugEnabled()) {
+                    Serializer serializer = Serializer.getInstance();
+                    ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
+                    serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
+                    try {
+                        log.debug("Calais Enhancements:\n{}",debugStream.toString("UTF-8"));
+                    } catch (UnsupportedEncodingException e) {
+                        e.printStackTrace();
+                    }
+                }
+            } finally {
+                ci.getLock().writeLock().unlock();
             }
         }
 
@@ -587,17 +584,6 @@ public class OpenCalaisEngine 
                 urlConn.getInputStream(), responseEncoding);
     }
 
-    public String getMetadataText(MGraph model, NonLiteral subj) {
-        String text = "";
-        for (Iterator<Triple> it = model.filter(subj, NIE_PLAINTEXTCONTENT, null); it.hasNext();) {
-            text += getLexicalForm(it.next().getObject());
-        }
-        if (text.trim().length() > 0) {
-            return text;
-        }
-        return null;
-    }
-
     public String getMetadataLanguage(MGraph model, NonLiteral subj) {
         Iterator<Triple> it = model.filter(subj, DC_LANGUAGE, null);
         if (it.hasNext()) {
@@ -624,21 +610,19 @@ public class OpenCalaisEngine 
      */
     protected void activate(ComponentContext ce) throws ConfigurationException {
         super.activate(ce);
-        if (ce != null) {
-            this.bundleContext = ce.getBundleContext();
-            //TODO initialize Extractor
-            Dictionary<String, String> properties = ce.getProperties();
-            String license = properties.get(LICENSE_KEY);
-            String url = properties.get(CALAIS_URL_KEY);
-            calaisTypeMapFile = properties.get(CALAIS_TYPE_MAP_KEY);
-            String standAlone = properties.get(CALAIS_NER_ONLY_MODE_KEY);
-            setLicenseKey(license);
-            setCalaisUrl(url);
-            calaisTypeMap = new HashMap<UriRef,UriRef>();
-            loadTypeMap(calaisTypeMapFile);
-            onlyNERMode = Boolean.parseBoolean(standAlone);
-            //      this.tcManager = TcManager.getInstance();
-        }
+        this.bundleContext = ce.getBundleContext();
+        //TODO initialize Extractor
+        Dictionary<String, Object> properties = ce.getProperties();
+        String license = (String)properties.get(LICENSE_KEY);
+        String url = (String)properties.get(CALAIS_URL_KEY);
+        calaisTypeMapFile = (String)properties.get(CALAIS_TYPE_MAP_KEY);
+        String standAlone = (String)properties.get(CALAIS_NER_ONLY_MODE_KEY);
+        setLicenseKey(license);
+        setCalaisUrl(url);
+        calaisTypeMap = new HashMap<UriRef,UriRef>();
+        loadTypeMap(calaisTypeMapFile);
+        onlyNERMode = Boolean.parseBoolean(standAlone);
+        //      this.tcManager = TcManager.getInstance();
     }
 
     /**