You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/02/02 14:52:28 UTC
svn commit: r1239618 [1/2] - in /incubator/stanbol/trunk/enhancer:
engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/
engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/
engin...
Author: rwesten
Date: Thu Feb 2 13:52:27 2012
New Revision: 1239618
URL: http://svn.apache.org/viewvc?rev=1239618&view=rev
Log:
STANBOL-478: Metaxa Engine now adds the extracted plain text as a Blob to the ContentItem. All other engines retrieve content from Blobs.
STANBOL-479: All current (and not deprecated) Engines do now support async processing
STANBOL-46: Asynchronous enhancements are now finally supported by both the EnhancementJobManager and the EnhancementEngines.
Other changes:
* Added two utility methods to the ContentItemHelper that allow searching for Blobs based on mime types and retrieving the text from a Blob.
* Metaxa Engine now allows configuring a list of mime types that are ignored. This is basically to avoid processing of text/plain content, but could also be used by users to deactivate other mime types.
* geonames Engine: changed most of the logging to the debug level. In addition, this engine does now use the correct logger.
Added:
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java (with props)
Modified:
incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java
incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java
incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
incubator/stanbol/trunk/enhancer/engines/taxonomylinking/src/main/java/org/apache/stanbol/enhancer/engines/taxonomy/impl/TaxonomyLinkingEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemHelper.java
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemImpl.java
incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java
incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
incubator/stanbol/trunk/enhancer/jobmanager/event/src/main/java/org/apache/stanbol/enhancer/jobmanager/event/impl/EnhancementJobHandler.java
Modified: incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java Thu Feb 2 13:52:27 2012
@@ -17,20 +17,18 @@
package org.apache.stanbol.enhancer.engines.autotagging.impl;
import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.PROPERTY_NAME;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
-import java.util.Iterator;
import java.util.List;
+import java.util.Map.Entry;
+import java.util.Set;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
-import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
@@ -39,11 +37,13 @@ import org.apache.felix.scr.annotations.
import org.apache.stanbol.autotagging.Autotagger;
import org.apache.stanbol.autotagging.TagInfo;
import org.apache.stanbol.enhancer.engines.autotagging.AutotaggerProvider;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -64,7 +64,7 @@ import org.slf4j.LoggerFactory;
public class RelatedTopicEnhancementEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements EnhancementEngine {
protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
-
+ protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
public static final String DEFAULT_NAME = "autotaggingRelatedTopic";
private static final Logger log = LoggerFactory.getLogger(RelatedTopicEnhancementEngine.class);
@@ -87,25 +87,25 @@ public class RelatedTopicEnhancementEngi
+ ci.getUri().getUnicodeString());
return;
}
- String mimeType = ci.getMimeType().split(";", 2)[0];
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
+ if(contentPart == null){
+ throw new IllegalStateException("No ContentPart with a supported Mime Type"
+ + "found for ContentItem "+ci.getUri()+"(supported: '"
+ + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was"
+ + "NOT called and indicates a bug in the used EnhancementJobManager!");
+ }
String text = "";
- if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) {
- try {
- text = IOUtils.toString(ci.getStream(),"UTF-8");
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
- } else {
- Iterator<Triple> it = ci.getMetadata().filter(new UriRef(ci.getUri().getUnicodeString()), NIE_PLAINTEXTCONTENT, null);
- while (it.hasNext()) {
- text += it.next().getObject();
- }
+ try {
+ text = ContentItemHelper.getText(contentPart.getValue());
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
}
if (text.trim().length() == 0) {
// TODO: make the length of the data a field of the ContentItem
// interface to be able to filter out empty items in the canEnhance
// method
- log.warn("nothing to extract a topic from");
+ log.warn("ContentPart {} of ContentItem {} does contain no text to extract a topic from",
+ contentPart.getKey(),ci.getUri());
return;
}
@@ -115,10 +115,16 @@ public class RelatedTopicEnhancementEngi
try {
List<TagInfo> suggestions = autotagger.suggestForType(text, type);
Collection<NonLiteral> noRelatedEnhancements = Collections.emptyList();
- for (TagInfo tag : suggestions) {
- EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory,
- graph, contentItemId,
- noRelatedEnhancements, tag);
+ //Acquire a write lock while writing the enhancement results
+ ci.getLock().writeLock().lock();
+ try {
+ for (TagInfo tag : suggestions) {
+ EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory,
+ graph, contentItemId,
+ noRelatedEnhancements, tag);
+ }
+ } finally {
+ ci.getLock().writeLock().unlock();
}
} catch (IOException e) {
throw new EngineException(this, ci, e);
@@ -126,17 +132,11 @@ public class RelatedTopicEnhancementEngi
}
public int canEnhance(ContentItem ci) {
- String mimeType = ci.getMimeType().split(";",2)[0];
- if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
- return ENHANCE_SYNCHRONOUS;
- }
- // check for existence of textual content in metadata
- UriRef subj = new UriRef(ci.getUri().getUnicodeString());
- Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
- if (it.hasNext()) {
- return ENHANCE_SYNCHRONOUS;
+ if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){
+ return ENHANCE_ASYNC; //RelatedTopic engine now supports async processing
+ } else {
+ return CANNOT_ENHANCE;
}
- return CANNOT_ENHANCE;
}
public void bindAutotaggerProvider(AutotaggerProvider autotaggerProvider) {
Added: incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java?rev=1239618&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java Thu Feb 2 13:52:27 2012
@@ -0,0 +1,93 @@
+package org.apache.stanbol.enhancer.engines.entitytagging.impl;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+
+import org.apache.clerezza.rdf.core.NonLiteral;
+import org.apache.clerezza.rdf.core.TripleCollection;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public final class NamedEntity {
+ private static final Logger log = LoggerFactory.getLogger(NamedEntity.class);
+ private final NonLiteral entity;
+ private final String name;
+ private final UriRef type;
+ private NamedEntity(NonLiteral entity, String name, UriRef type) {
+ this.entity = entity;
+ this.name = name;
+ this.type = type;
+ }
+ /**
+ * Getter for the Node providing the information about that entity
+ * @return the entity
+ */
+ public final NonLiteral getEntity() {
+ return entity;
+ }
+ /**
+ * Getter for the name
+ * @return the name
+ */
+ public final String getName() {
+ return name;
+ }
+ /**
+ * Getter for the type
+ * @return the type
+ */
+ public final UriRef getType() {
+ return type;
+ }
+ @Override
+ public int hashCode() {
+ return entity.hashCode();
+ }
+ @Override
+ public boolean equals(Object o) {
+ return o instanceof NamedEntity && entity.equals(((NamedEntity)o).entity);
+ }
+ @Override
+ public String toString() {
+ return String.format("NamedEntity %s (name=%s|type=%s)",entity,name,type);
+ }
+ /**
+ * Extracts the information of an {@link NamedEntity} from an
+ * {@link TechnicalClasses#ENHANCER_TEXTANNOTATION} instance.
+ * @param graph the graph with the information
+ * @param textAnnotation the text annotation instance
+ * @return the {@link NamedEntity} or <code>null</code> if the parsed
+ * text annotation is missing required information.
+ */
+ public static NamedEntity createFromTextAnnotation(TripleCollection graph, NonLiteral textAnnotation){
+ String name = EnhancementEngineHelper.getString(graph, textAnnotation, ENHANCER_SELECTED_TEXT);
+ if (name == null) {
+ log.debug("Unable to create NamedEntity for TextAnnotation {} "
+ + "because property {} is not present",textAnnotation,ENHANCER_SELECTED_TEXT);
+ return null;
+ }
+ name = name.trim();
+ if(name.isEmpty()){
+ log.debug("Unable to process TextAnnotation {} because its selects "
+ + "an empty Stirng !",textAnnotation);
+ return null;
+ }
+ UriRef type = EnhancementEngineHelper.getReference(graph, textAnnotation, DC_TYPE);
+ if (type == null) {
+ log.warn("Unable to process TextAnnotation {} because property {}"
+ + " is not present!",textAnnotation, DC_TYPE);
+ return null;
+ }
+ // remove punctuation form the search string
+ return new NamedEntity(textAnnotation,cleanupKeywords(name),type);
+ }
+ /**
+ * Removes punctuation form a parsed string
+ */
+ private static String cleanupKeywords(String keywords) {
+ return keywords.replaceAll("\\p{P}", " ").trim();
+ }
+}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntity.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/entitytagging/src/main/java/org/apache/stanbol/enhancer/engines/entitytagging/impl/NamedEntityTaggingEngine.java Thu Feb 2 13:52:27 2012
@@ -17,8 +17,6 @@
package org.apache.stanbol.enhancer.engines.entitytagging.impl;
import static org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import java.util.ArrayList;
@@ -28,12 +26,12 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Triple;
-import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
@@ -52,7 +50,6 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
@@ -318,36 +315,72 @@ public class NamedEntityTaggingEngine
} else { // null indicates to use the Entityhub to lookup Entities
site = null;
}
- UriRef contentItemId = ci.getUri();
-
MGraph graph = ci.getMetadata();
LiteralFactory literalFactory = LiteralFactory.getInstance();
-
- // Retrieve the existing text annotations
- Map<UriRef,List<UriRef>> textAnnotations = new HashMap<UriRef,List<UriRef>>();
- for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it
- .hasNext();) {
- UriRef uri = (UriRef) it.next().getSubject();
- if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) {
- // this is not the most specific occurrence of this name: skip
- continue;
- }
- // This is a first occurrence, collect any subsumed annotations
- List<UriRef> subsumed = new ArrayList<UriRef>();
- for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2.hasNext();) {
- subsumed.add((UriRef) it2.next().getSubject());
+ // Retrieve the existing text annotations (requires read lock)
+ Map<NamedEntity,List<UriRef>> textAnnotations = new HashMap<NamedEntity,List<UriRef>>();
+ ci.getLock().readLock().lock();
+ try {
+ for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it
+ .hasNext();) {
+ UriRef uri = (UriRef) it.next().getSubject();
+ if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) {
+ // this is not the most specific occurrence of this name: skip
+ continue;
+ }
+ NamedEntity namedEntity = NamedEntity.createFromTextAnnotation(graph, uri);
+ if(namedEntity != null){
+ // This is a first occurrence, collect any subsumed annotations
+ List<UriRef> subsumed = new ArrayList<UriRef>();
+ for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2.hasNext();) {
+ subsumed.add((UriRef) it2.next().getSubject());
+ }
+ textAnnotations.put(namedEntity, subsumed);
+ }
}
- textAnnotations.put(uri, subsumed);
+ } finally {
+ ci.getLock().readLock().unlock();
}
-
- for (Map.Entry<UriRef,List<UriRef>> entry : textAnnotations.entrySet()) {
+ //search the suggestions
+ Map<NamedEntity,List<Entity>> suggestions = new HashMap<NamedEntity,List<Entity>>(textAnnotations.size());
+ for (Entry<NamedEntity,List<UriRef>> entry : textAnnotations.entrySet()) {
try {
- computeEntityRecommentations(site,literalFactory, graph, contentItemId, entry.getKey(),
- entry.getValue());
+ List<Entity> entitySuggestions = computeEntityRecommentations(
+ site, entry.getKey(),entry.getValue());
+ if(entitySuggestions != null && !entitySuggestions.isEmpty()){
+ suggestions.put(entry.getKey(), entitySuggestions);
+ }
} catch (EntityhubException e) {
throw new EngineException(this, ci, e);
}
}
+ //now write the results (requires write lock)
+ ci.getLock().writeLock().lock();
+ try {
+ RdfValueFactory factory = RdfValueFactory.getInstance();
+ Map<String, Representation> entityData = new HashMap<String,Representation>();
+ for(Entry<NamedEntity,List<Entity>> entitySuggestions : suggestions.entrySet()){
+ List<UriRef> subsumed = textAnnotations.get(entitySuggestions.getKey());
+ List<NonLiteral> annotationsToRelate = new ArrayList<NonLiteral>(subsumed);
+ annotationsToRelate.add(entitySuggestions.getKey().getEntity());
+ for(Entity suggestion : entitySuggestions.getValue()){
+ log.debug("Add Suggestion {} for {}", suggestion.getId(), entitySuggestions.getKey());
+ EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, ci.getUri(),
+ annotationsToRelate, suggestion.getRepresentation(), nameField);
+ if (dereferenceEntities) {
+ entityData.put(suggestion.getId(), suggestion.getRepresentation());
+ }
+ }
+ }
+ //if dereferneceEntities is true the entityData will also contain all
+ //Representations to add! If false entityData will be empty
+ for(Representation rep : entityData.values()){
+ graph.addAll(factory.toRdfRepresentation(rep).getRdfGraph());
+ }
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
+
}
/**
@@ -355,7 +388,6 @@ public class NamedEntityTaggingEngine
* @param site The {@link ReferencedSiteException} id or <code>null</code> to
* use the {@link Entityhub}
* @param literalFactory the {@link LiteralFactory} used to create RDF Literals
- * @param graph the graph to write the lined entities
* @param contentItemId the id of the contentItem
* @param textAnnotation the text annotation to enhance
* @param subsumedAnnotations other text annotations for the same entity
@@ -363,42 +395,19 @@ public class NamedEntityTaggingEngine
* @throws EntityhubException On any Error while looking up Entities via
* the Entityhub
*/
- protected final Iterable<Entity> computeEntityRecommentations(ReferencedSite site,
- LiteralFactory literalFactory,
- MGraph graph,
- UriRef contentItemId,
- UriRef textAnnotation,
+ protected final List<Entity> computeEntityRecommentations(ReferencedSite site,
+ NamedEntity namedEntity,
List<UriRef> subsumedAnnotations) throws EntityhubException {
// First get the required properties for the parsed textAnnotation
// ... and check the values
- String name = EnhancementEngineHelper.getString(graph, textAnnotation, ENHANCER_SELECTED_TEXT);
- if (name == null) {
- log.info("Unable to process TextAnnotation " + textAnnotation + " because property"
- + ENHANCER_SELECTED_TEXT + " is not present");
- return Collections.emptyList();
- }
- if(name.isEmpty()){
- log.info("Unable to process TextAnnotation " + textAnnotation +
- " because an empty Stirng is selected by " + ENHANCER_SELECTED_TEXT + "");
- return Collections.emptyList();
- }
-
- UriRef type = EnhancementEngineHelper.getReference(graph, textAnnotation, DC_TYPE);
- if (type == null) {
- log.warn("Unable to process TextAnnotation " + textAnnotation + " because property" + DC_TYPE
- + " is not present");
- return Collections.emptyList();
- }
- // remove punctuation form the search string
- name = cleanupKeywords(name);
- log.debug("Process TextAnnotation " + name + " type=" + type);
+ log.debug("Process {}", namedEntity);
FieldQuery query = site == null ? //if site is NULL use the Entityhub
entityhub.getQueryFactory().createFieldQuery() :
site.getQueryFactory().createFieldQuery();
// replace spaces with plus to create an AND search for all words in the name!
- query.setConstraint(nameField, new TextConstraint(name));// name.replace(' ', '+')));
- if (OntologicalClasses.DBPEDIA_PERSON.equals(type)) {
+ query.setConstraint(nameField, new TextConstraint(namedEntity.getName()));// name.replace(' ', '+')));
+ if (OntologicalClasses.DBPEDIA_PERSON.equals(namedEntity.getType())) {
if (personState) {
if (personType != null) {
query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(personType));
@@ -408,7 +417,7 @@ public class NamedEntityTaggingEngine
// ignore people
return Collections.emptyList();
}
- } else if (DBPEDIA_ORGANISATION.equals(type)) {
+ } else if (DBPEDIA_ORGANISATION.equals(namedEntity.getType())) {
if (orgState) {
if (orgType != null) {
query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(orgType));
@@ -418,7 +427,7 @@ public class NamedEntityTaggingEngine
// ignore people
return Collections.emptyList();
}
- } else if (OntologicalClasses.DBPEDIA_PLACE.equals(type)) {
+ } else if (OntologicalClasses.DBPEDIA_PLACE.equals(namedEntity.getType())) {
if (this.placeState) {
if (this.placeType != null) {
query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(placeType));
@@ -435,9 +444,6 @@ public class NamedEntityTaggingEngine
site.findEntities(query); //else the referenced site
log.debug("{} results returned by query {}", results.size(), query);
- List<NonLiteral> annotationsToRelate = new ArrayList<NonLiteral>();
- annotationsToRelate.add(textAnnotation);
- annotationsToRelate.addAll(subsumedAnnotations);
Float maxScore = null;
int exactCount = 0;
List<Entity> matches = new ArrayList<Entity>(numSuggestions);
@@ -452,7 +458,7 @@ public class NamedEntityTaggingEngine
while(labels.hasNext() && !found){
Text label = labels.next();
if(label.getLanguage() == null || label.getLanguage().startsWith("en")){
- if(label.getText().equalsIgnoreCase(name)){
+ if(label.getText().equalsIgnoreCase(namedEntity.getName())){
found = true;
}
}
@@ -464,7 +470,6 @@ public class NamedEntityTaggingEngine
matches.add(guess);
}
}
- RdfValueFactory factory = RdfValueFactory.getInstance();
//now write the results
for(int i=0;i<matches.size();i++){
Representation rep = matches.get(i).getRepresentation();
@@ -477,15 +482,8 @@ public class NamedEntityTaggingEngine
maxScore.doubleValue()+(score != null?score.doubleValue():0));
}
}
- log.debug("Adding {} to ContentItem {}", rep.getId(), contentItemId);
- EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, contentItemId,
- annotationsToRelate, rep, nameField);
-
- if (dereferenceEntities) {
- graph.addAll(factory.toRdfRepresentation(rep).getRdfGraph());
- }
}
- return results;
+ return matches;
}
public int canEnhance(ContentItem ci) {
@@ -493,7 +491,7 @@ public class NamedEntityTaggingEngine
* This engine consumes existing enhancements because of that it can enhance any type of ci! TODO: It
* would also be possible to check here if there is an TextAnnotation and use that as result!
*/
- return ENHANCE_SYNCHRONOUS;
+ return ENHANCE_ASYNC; //Entity tagging now supports asyc processing
}
@Override
@@ -502,10 +500,4 @@ public class NamedEntityTaggingEngine
(Object) defaultOrder));
}
- /**
- * Removes punctuation form a parsed string
- */
- private static String cleanupKeywords(String keywords) {
- return keywords.replaceAll("\\p{P}", " ").trim();
- }
}
Modified: incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/geonames/src/main/java/org/apache/stanbol/enhancer/engines/geonames/impl/LocationEnhancementEngine.java Thu Feb 2 13:52:27 2012
@@ -99,7 +99,7 @@ public class LocationEnhancementEngine
public static final Map<String, Collection<UriRef>> FEATURE_TYPE_CONCEPT_MAPPINGS;
- private static final Logger log = LoggerFactory.getLogger(EnhancementEngineHelper.class);
+ private static final Logger log = LoggerFactory.getLogger(LocationEnhancementEngine.class);
/**
* Default value for minimum scores of search results are added to the
@@ -372,9 +372,9 @@ public class LocationEnhancementEngine
}
if (results != null) {
for (Toponym result : results) {
- log.info("process result " + result.getGeoNameId() + " " + result.getName());
+ log.debug("process result {} {}",result.getGeoNameId(),result.getName());
Double score = getToponymScore(result);
- log.info(" > score " + score);
+ log.debug(" > score {}",score);
if (score != null) {
if (score < minScore) {
//if score is lower than the under bound, than stop
@@ -391,9 +391,9 @@ public class LocationEnhancementEngine
//write the enhancement!
NonLiteral locationEnhancement = writeEntityEnhancement(
contentItemId, graph, literalFactory, result, entry.getValue(), null, null);
- log.info(" > " + score + " >= " + minHierarchyScore);
+ log.debug(" > {} >= {}",score,minHierarchyScore);
if (score != null && score >= minHierarchyScore) {
- log.info(" > getHierarchy for " + result.getGeoNameId() + " " + result.getName());
+ log.debug(" > getHierarchy for {} {}",result.getGeoNameId(),result.getName());
//get the hierarchy
try {
Iterator<Toponym> hierarchy = getHierarchy(result).iterator();
@@ -409,7 +409,7 @@ public class LocationEnhancementEngine
if (result.getGeoNameId() != hierarchyEntry.getGeoNameId()) {
//TODO: add additional checks based on possible
// configuration here!
- log.info(" - write hierarchy " + hierarchyEntry.getGeoNameId() + " " + hierarchyEntry.getName());
+ log.debug(" - write hierarchy {} {}",hierarchyEntry.getGeoNameId(),hierarchyEntry.getName());
/*
* The hierarchy service dose not provide a score, because it would be 1.0
* so we need to set the score to this value.
Modified: incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java Thu Feb 2 13:52:27 2012
@@ -16,7 +16,6 @@
*/
package org.apache.stanbol.enhancer.engines.keywordextraction.engine;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
import static org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum.getFullName;
import java.io.IOException;
@@ -28,6 +27,7 @@ import java.util.Dictionary;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
import org.apache.clerezza.rdf.core.Language;
@@ -38,7 +38,6 @@ import org.apache.clerezza.rdf.core.Trip
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
-import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
@@ -66,17 +65,18 @@ import org.apache.stanbol.enhancer.engin
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.OpenNlpAnalysedContentFactory;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.ReferencedSiteSearcher;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.impl.TrackingEntitySearcher;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory;
import org.apache.stanbol.entityhub.servicesapi.Entityhub;
-import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
import org.apache.stanbol.entityhub.servicesapi.model.Reference;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.site.ReferencedSite;
@@ -132,6 +132,10 @@ public class KeywordLinkingEngine
*/
protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
/**
+ * Contains the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+ */
+ protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+ /**
* The default value for the Execution of this Engine.
* This Engine creates TextAnnotations that should not be processed by other Engines.
* Therefore it uses a lower rank than {@link ServiceProperties#ORDERING_DEFAULT}
@@ -309,17 +313,11 @@ public class KeywordLinkingEngine
@Override
public int canEnhance(ContentItem ci) throws EngineException {
- String mimeType = ci.getMimeType().split(";", 2)[0];
- if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
- return ENHANCE_SYNCHRONOUS;
- }
- // check for existence of textual content in metadata
- UriRef subj = ci.getUri();
- Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
- if (it.hasNext()) {
- return ENHANCE_SYNCHRONOUS;
+ if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){
+ return ENHANCE_ASYNC; //KeywordLinking now supports async processing
+ } else {
+ return CANNOT_ENHANCE;
}
- return CANNOT_ENHANCE;
}
@Override
@@ -327,28 +325,53 @@ public class KeywordLinkingEngine
if(isOfflineMode() && !entitySearcher.supportsOfflineMode()){
throw new EngineException("Offline mode is not supported by the Component used to lookup Entities");
}
- String mimeType = ci.getMimeType().split(";", 2)[0];
- String text = extractText(ci, mimeType);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
+ if(contentPart == null){
+ throw new IllegalStateException("No ContentPart with a supported Mime Type"
+ + "found for ContentItem "+ci.getUri()+"(supported: '"
+ + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was"
+ + "NOT called and indicates a bug in the used EnhancementJobManager!");
+ }
+ String text;
+ try {
+ text = ContentItemHelper.getText(contentPart.getValue());
+ } catch (IOException e) {
+ throw new InvalidContentException(String.format("Unable to extract "
+ +" text from ContentPart %s of ContentItem %s!",
+ contentPart.getKey(),ci.getUri()),e);
+ }
if (text.trim().length() == 0) {
// TODO: make the length of the data a field of the ContentItem
// interface to be able to filter out empty items in the canEnhance
// method
- log.warn("nothing to extract knowledge from in ContentItem {}", ci);
+ log.warn("ContentPart {} of ContentItem does not contain any Text to extract knowledge from",
+ contentPart.getKey(), ci);
return;
}
//Determine the language
- String language = extractLanguage(ci);
+ String language;
+ ci.getLock().readLock().lock();
+ try {
+ language = extractLanguage(ci);
+ } finally {
+ ci.getLock().readLock().unlock();
+ }
if(isProcessableLanguages(language)){
log.debug("computeEnhancements for ContentItem {} language {} text={}",
new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100)});
- EntityLinker taxonomyLinker = new EntityLinker(
+ EntityLinker entityLinker = new EntityLinker(
analysedContentFactory.create(text, language),
entitySearcher, linkerConfig);
//process
- taxonomyLinker.process();
- //write results
- writeEnhancements(ci, taxonomyLinker.getLinkedEntities().values(), language);
+ entityLinker.process();
+ //write results (requires a write lock)
+ ci.getLock().writeLock().lock();
+ try {
+ writeEnhancements(ci, entityLinker.getLinkedEntities().values(), language);
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
} else {
log.debug("ignore ContentItem {} because language '{}' is not configured to" +
"be processed by this engine.",ci.getUri().getUnicodeString(),language);
@@ -456,39 +479,6 @@ public class KeywordLinkingEngine
}
}
- /**
- * Extracts the text from the parsed contentItem. In case the content type is
- * plain text, it directly reads the text from the stream. In other cases it
- * tries to read the string representation from the metadata by looking for
- * values of the {@link org.apache.stanbol.enhancer.servicesapi.rdf.Properties#NIE_PLAINTEXTCONTENT}
- * property.<p>
- * TODO: This is a Workaround for the currently not implemented Adapter
- * Pattern for the Stanbol Enhancer.
- * @param ci
- * @param mimeType
- * @return
- * @throws InvalidContentException
- */
- private String extractText(ContentItem ci, String mimeType) throws InvalidContentException {
- String text;
- if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) {
- try {
- text = IOUtils.toString(ci.getStream(),"UTF-8");
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
- } else {
- //TODO: change that as soon the Adapter Pattern is used for multiple
- // mimetype support.
- StringBuilder textBuilder = new StringBuilder();
- Iterator<Triple> it = ci.getMetadata().filter(new UriRef(ci.getUri().getUnicodeString()), NIE_PLAINTEXTCONTENT, null);
- while (it.hasNext()) {
- textBuilder.append(it.next().getObject());
- }
- text = textBuilder.toString();
- }
- return text;
- }
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
* Methods for activate() and deactivate() the properties configureable via
Modified: incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java Thu Feb 2 13:52:27 2012
@@ -17,30 +17,30 @@
package org.apache.stanbol.enhancer.engines.langid;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
import java.io.IOException;
import java.util.Collections;
import java.util.Dictionary;
-import java.util.Iterator;
import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
-import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.tika.language.LanguageIdentifier;
import org.osgi.service.cm.ConfigurationException;
@@ -81,6 +81,10 @@ public class LangIdEnhancementEngine
* This contains the only MIME type directly supported by this enhancement engine.
*/
private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+ /**
+ * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+ */
+ private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
/**
* This contains the logger.
@@ -115,36 +119,31 @@ public class LangIdEnhancementEngine
}
public int canEnhance(ContentItem ci) throws EngineException {
- String mimeType = ci.getMimeType().split(";", 2)[0];
- if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
- return ENHANCE_SYNCHRONOUS;
- }
-
- // TODO: check whether there is the graph contains the text
- UriRef subj = ci.getUri();
- Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
- if (it.hasNext()) {
- return ENHANCE_SYNCHRONOUS;
+ if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null){
+ return ENHANCE_ASYNC; //Langid now supports async processing
+ } else {
+ return CANNOT_ENHANCE;
}
- return CANNOT_ENHANCE;
}
public void computeEnhancements(ContentItem ci) throws EngineException {
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
+ if(contentPart == null){
+ throw new IllegalStateException("No ContentPart with Mimetype '"
+ + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
+ + ": This is also checked in the canEnhance method! -> This "
+ + "indicated an Bug in the implementation of the "
+ + "EnhancementJobManager!");
+ }
String text = "";
- if (TEXT_PLAIN_MIMETYPE.equals(ci.getMimeType())) {
- try {
- text = IOUtils.toString(ci.getStream(),"UTF-8");
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
- } else {
- Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null);
- while (it.hasNext()) {
- text += it.next().getObject();
- }
+ try {
+ text = ContentItemHelper.getText(contentPart.getValue());
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
}
if (text.trim().length() == 0) {
- log.warn("no text found");
+ log.info("No text contained in ContentPart {} of ContentItem {}",
+ contentPart.getKey(),ci.getUri());
return;
}
@@ -159,8 +158,13 @@ public class LangIdEnhancementEngine
// add language to metadata
MGraph g = ci.getMetadata();
- UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
- g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
+ ci.getLock().writeLock().lock();
+ try {
+ UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
}
public int getProbeLength() {
Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java Thu Feb 2 13:52:27 2012
@@ -16,20 +16,29 @@
*/
package org.apache.stanbol.enhancer.engines.metaxa;
+import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID;
+
+import java.io.BufferedWriter;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.Charset;
+import java.util.Arrays;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Map;
+import java.util.Set;
import org.apache.clerezza.rdf.core.BNode;
-import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.core.impl.TypedLiteralImpl;
import org.apache.felix.scr.annotations.Component;
@@ -39,12 +48,14 @@ import org.apache.stanbol.enhancer.engin
import org.apache.stanbol.enhancer.engines.metaxa.core.RDF2GoUtils;
import org.apache.stanbol.enhancer.engines.metaxa.core.html.BundleURIResolver;
import org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlExtractorFactory;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.InMemoryBlob;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.ontoware.aifbcommons.collection.ClosableIterator;
import org.ontoware.rdf2go.model.Model;
@@ -54,6 +65,7 @@ import org.ontoware.rdf2go.model.node.Da
import org.ontoware.rdf2go.model.node.Node;
import org.ontoware.rdf2go.model.node.PlainLiteral;
import org.ontoware.rdf2go.model.node.URI;
+import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.osgi.framework.BundleContext;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
@@ -80,6 +92,11 @@ public class MetaxaEngine
private static final Logger log = LoggerFactory.getLogger(MetaxaEngine.class);
/**
+ * Plain text content of a content item.
+ */
+ public static final UriRef NIE_PLAINTEXTCONTENT = new UriRef(NamespaceEnum.nie + "plainTextContent");
+ private static final URIImpl NIE_PLAINTEXT_PROPERTY = new URIImpl(NIE_PLAINTEXTCONTENT.getUnicodeString());
+ /**
* The default value for the Execution of this Engine. Currently set to
* {@link ServiceProperties#ORDERING_PRE_PROCESSING}
*/
@@ -97,6 +114,8 @@ public class MetaxaEngine
@Property(value=MetaxaEngine.DEFAULT_HTML_EXTRACTOR_REGISTRY)
public static final String HTML_EXTRACTOR_REGISTRY = "org.apache.stanbol.enhancer.engines.metaxa.htmlextractors";
+ @Property(value={"text/plain"},cardinality=1000)
+ public static final String IGNORE_MIME_TYPES = "org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes";
private MetaxaCore extractor;
BundleContext bundleContext;
@@ -104,6 +123,8 @@ public class MetaxaEngine
public static final String DEFAULT_EXTRACTION_REGISTRY = "extractionregistry.xml";
public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = "htmlextractors.xml";
+ private Set<String> ignoredMimeTypes;
+
/**
* The activate method.
*
@@ -114,25 +135,38 @@ public class MetaxaEngine
super.activate(ce);
String extractionRegistry = DEFAULT_EXTRACTION_REGISTRY;
String htmlExtractors = DEFAULT_HTML_EXTRACTOR_REGISTRY;
- if (ce != null) {
- this.bundleContext = ce.getBundleContext();
- BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
- try {
- Dictionary<String, String> properties = ce.getProperties();
- String confFile = properties.get(GLOBAL_EXTRACTOR_REGISTRY);
- if (confFile != null && confFile.trim().length() > 0) {
- extractionRegistry = confFile;
- }
- confFile = properties.get(HTML_EXTRACTOR_REGISTRY);
- if (confFile != null && confFile.trim().length() > 0) {
- htmlExtractors = confFile;
+ this.bundleContext = ce.getBundleContext();
+ BundleURIResolver.BUNDLE = this.bundleContext.getBundle();
+ try {
+ Dictionary<String, Object> properties = ce.getProperties();
+ String confFile = (String)properties.get(GLOBAL_EXTRACTOR_REGISTRY);
+ if (confFile != null && confFile.trim().length() > 0) {
+ extractionRegistry = confFile;
+ }
+ confFile = (String)properties.get(HTML_EXTRACTOR_REGISTRY);
+ if (confFile != null && confFile.trim().length() > 0) {
+ htmlExtractors = confFile;
+ }
+ this.extractor = new MetaxaCore(extractionRegistry);
+ HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors;
+ } catch (IOException e) {
+ log.error(e.getLocalizedMessage(), e);
+ throw e;
+ }
+ Object value = ce.getProperties().get(IGNORE_MIME_TYPES);
+ if(value instanceof String[]){
+ ignoredMimeTypes = new HashSet<String>(Arrays.asList((String[])value));
+ } else if(value instanceof Iterable<?>){
+ ignoredMimeTypes = new HashSet<String>();
+ for(Object mimeType : (Iterable<?>)value){
+ if(mimeType != null){
+ ignoredMimeTypes.add(mimeType.toString());
}
- this.extractor = new MetaxaCore(extractionRegistry);
- HtmlExtractorFactory.REGISTRY_CONFIGURATION = htmlExtractors;
- } catch (IOException e) {
- log.error(e.getLocalizedMessage(), e);
- throw e;
}
+ } else if(value != null && !value.toString().isEmpty()){
+ ignoredMimeTypes = Collections.singleton(value.toString());
+ } else {
+ ignoredMimeTypes = Collections.singleton("text/plain");
}
}
@@ -147,18 +181,26 @@ public class MetaxaEngine
}
public int canEnhance(ContentItem ci) throws EngineException {
- String mimeType = ci.getMimeType().split(";", 2)[0];
- if (this.extractor.isSupported(mimeType)) {
- return ENHANCE_SYNCHRONOUS;
+ String mimeType = ci.getMimeType();
+ if (!ignoredMimeTypes.contains(mimeType) &&
+ this.extractor.isSupported(mimeType)) {
+ return ENHANCE_ASYNC; //supports now asynchronous execution!
}
return CANNOT_ENHANCE;
}
public void computeEnhancements(ContentItem ci) throws EngineException {
-
try {
// get model from the extraction
- Model m = this.extractor.extract(ci.getStream(), ci.getUri().getUnicodeString(), ci.getMimeType());
+ URIImpl docId;
+ Model m;
+ ci.getLock().readLock().lock();
+ try {
+ docId = new URIImpl(ci.getUri().getUnicodeString());
+ m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
+ } finally {
+ ci.getLock().readLock().unlock();
+ }
// add the statements from this model to the Metadata model
if (null != m) {
/*
@@ -166,28 +208,62 @@ public class MetaxaEngine
log.info(text);
*/
// get the model where to add the statements
- MGraph g = ci.getMetadata();
+ /*
+ * NOTE(rwesten):
+ * There is no need to create a TextEnhancement to mark that
+ * a ContentItem was processed by Metaxa, because the
+ * ExecutionMetadata already records this.
+ */
+ //
// create enhancement
- UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ //UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
// set confidence value to 1.0
- LiteralFactory literalFactory = LiteralFactory.getInstance();
- g.add(new TripleImpl(textEnhancement, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(1.0)));
+ //g.add(new TripleImpl(textEnhancement, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(1.0)));
RDF2GoUtils.urifyBlankNodes(m);
HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
ClosableIterator<Statement> it = m.iterator();
+ ByteArrayOutputStream byteOut = new ByteArrayOutputStream();
+ Charset charset = Charset.forName("UTF-8");
+ BufferedWriter out = new BufferedWriter(new OutputStreamWriter(byteOut, charset));
+ MGraph g = new SimpleMGraph(); //first add to a temporary graph
while (it.hasNext()) {
Statement oneStmt = it.next();
-
- NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
- UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
- Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
-
- if (null != subject && null != predicate && null != object) {
- Triple t = new TripleImpl(subject, predicate, object);
- g.add(t);
- log.debug("added " + t.toString());
+ //Triples that carry the plain text version of the content need
+ //to be treated differently: their objects are written to the
+ //plain text Blob instead of the metadata graph.
+ if(oneStmt.getSubject().equals(docId) &&
+ oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)){
+ out.write(oneStmt.getObject().toString());
+ } else { //add metadata to the metadata of the contentItem
+ NonLiteral subject = (NonLiteral) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
+ UriRef predicate = (UriRef) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
+ Resource object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
+
+ if (null != subject && null != predicate && null != object) {
+ Triple t = new TripleImpl(subject, predicate, object);
+ g.add(t);
+ log.debug("added " + t.toString());
+ }
}
}
+ ci.getLock().writeLock().lock();
+ try {
+ //now acquire a write lock and add the extracted
+ //metadata to the content item
+ ci.getMetadata().addAll(g);
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
+ out.close();
+ byte[] plainTextData = byteOut.toByteArray();
+ if(plainTextData.length > 0){
+ //add plain text to the content item
+ UriRef blobUri = new UriRef("urn:metaxa:plain-text:"+randomUUID());
+ Blob plainTextBlob = new InMemoryBlob(plainTextData,
+ "text/plain;charset="+charset.toString());
+ ci.addPart(blobUri, plainTextBlob);
+ //TODO: add contentPart metadata to the contentItem
+ }
it.close();
m.close();
}
Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/MetaxaCore.java Thu Feb 2 13:52:27 2012
@@ -102,7 +102,7 @@ public class MetaxaCore {
* if there is an error when reading the input stream
*/
public Model extract(
- InputStream in, String docId, String mimeType)
+ InputStream in, URIImpl docId, String mimeType)
throws ExtractorException, IOException {
@SuppressWarnings("rawtypes")
@@ -116,7 +116,7 @@ public class MetaxaCore {
RDFContainerFactory containerFactory =
new RDFContainerFactoryImpl();
RDFContainer container =
- containerFactory.getRDFContainer(new URIImpl(docId));
+ containerFactory.getRDFContainer(docId);
extractor.extract(
container.getDescribedUri(),
new BufferedInputStream(in, 8192),
Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties Thu Feb 2 13:52:27 2012
@@ -37,4 +37,8 @@ of a resource on the bundle classpath th
org.apache.stanbol.enhancer.engines.metaxa.htmlextractors.name=Html Extractors
org.apache.stanbol.enhancer.engines.metaxa.htmlextractors.description=The path of a \
-resource on the bundle classpath that specifies which extractors are used for HTML pages.
\ No newline at end of file
+resource on the bundle classpath that specifies which extractors are used for HTML pages.
+
+org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes.name=Ignored Mime Types
+org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes.description=This allows to \
+provide a list of MIME TYPES that are not processed by this engine.
\ No newline at end of file
Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java Thu Feb 2 13:52:27 2012
@@ -35,6 +35,7 @@ import org.ontoware.rdf2go.model.Model;
import org.ontoware.rdf2go.model.Statement;
import org.ontoware.rdf2go.model.node.BlankNode;
import org.ontoware.rdf2go.model.node.Variable;
+import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.semanticdesktop.aperture.vocabulary.NMO;
import org.slf4j.Logger;
@@ -88,7 +89,7 @@ public class TestMetaxaCore {
InputStream in = getResourceAsStream(testFile);
assertNotNull("failed to load resource " + testFile, in);
- Model m = extractor.extract(in, "file://" + testFile, "application/pdf");
+ Model m = extractor.extract(in, new URIImpl("file://" + testFile), "application/pdf");
String text = MetaxaCore.getText(m);
// get expected result
InputStream in2 = getResourceAsStream(testResultFile);
@@ -119,7 +120,7 @@ public class TestMetaxaCore {
InputStream in = getResourceAsStream(testFile);
assertNotNull("failed to load resource " + testFile, in);
- Model m = extractor.extract(in, "file://" + testFile, "text/html");
+ Model m = extractor.extract(in, new URIImpl("file://" + testFile), "text/html");
String text = MetaxaCore.getText(m);
// get expected result
InputStream in2 = getResourceAsStream(testResultFile);
@@ -149,7 +150,7 @@ public class TestMetaxaCore {
InputStream in = getResourceAsStream(testFile);
assertNotNull("failed to load resource " + testFile, in);
- Model m = extractor.extract(in, "file://" + testFile, "text/html");
+ Model m = extractor.extract(in, new URIImpl("file://" + testFile), "text/html");
String text = MetaxaCore.getText(m);
// get expected result
InputStream in2 = getResourceAsStream(testResultFile);
@@ -169,7 +170,7 @@ public class TestMetaxaCore {
String testFile = "mail-multipart-test.eml";
InputStream in = getResourceAsStream(testFile);
assertNotNull("failed to load resource " + testFile, in);
- Model m = extractor.extract(in, "file://" + testFile, "message/rfc822");
+ Model m = extractor.extract(in, new URIImpl("file://" + testFile), "message/rfc822");
boolean textContained = m.contains(Variable.ANY, NMO.plainTextMessageContent, Variable.ANY);
assertTrue(textContained);
}
Modified: incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java Thu Feb 2 13:52:27 2012
@@ -23,7 +23,6 @@ import static org.apache.stanbol.enhance
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
@@ -44,9 +43,11 @@ import java.util.Collection;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
-import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
import org.apache.clerezza.rdf.core.Graph;
import org.apache.clerezza.rdf.core.Literal;
@@ -73,12 +74,14 @@ import org.apache.felix.scr.annotations.
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.osgi.framework.BundleContext;
import org.osgi.service.cm.ConfigurationException;
@@ -108,9 +111,10 @@ public class OpenCalaisEngine
/**
* This contains the directly supported MIME types of this enhancement engine.
- * For handling other mime-types the plain text must be contained in the metadata as by Metaxa.
*/
- protected static final List<String> SUPPORTED_MIMETYPES = Arrays.asList(new String[]{"text/plain", "text/html"});
+ protected static final Set<String> SUPPORTED_MIMETYPES =
+ Collections.unmodifiableSet(new HashSet<String>(
+ Arrays.asList("text/plain", "text/html")));
/**
* This contains a list of languages supported by OpenCalais.
@@ -118,7 +122,9 @@ public class OpenCalaisEngine
* it is left to the grace of the OpenCalais whether it accepts the text.
* OpenCalais uses its own language identifcation anyway.
*/
- protected static final List<String> SUPPORTED_LANGUAGES = Arrays.asList(new String[]{"en", "fr", "es"});
+ protected static final Set<String> SUPPORTED_LANGUAGES =
+ Collections.unmodifiableSet(new HashSet<String>(
+ Arrays.asList("en", "fr", "es")));
/**
* The default value for the Execution of this Engine. Currently set to
@@ -248,62 +254,53 @@ public class OpenCalaisEngine
}
public int canEnhance(ContentItem ci) throws EngineException {
- //Engine will no longer activate if no license key is set
-// if (getLicenseKey() == null || getLicenseKey().trim().length() == 0) {
-// //do nothing if no license key is defined
-// log.warn("No license key defined. The engine will not work!");
-// return CANNOT_ENHANCE;
-// }
- UriRef subj = ci.getUri();
- String mimeType = ci.getMimeType().split(";", 2)[0];
- if (SUPPORTED_MIMETYPES.contains(mimeType.toLowerCase())) {
- // check language
+ if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){
String language = getMetadataLanguage(ci.getMetadata(), null);
if (language != null && !SUPPORTED_LANGUAGES.contains(language)) {
- log.warn("Wrong language for Calais: {}", language);
+ log.info("OpenCalais can not process ContentItem {} because "
+ + "language {} is not supported (supported: {})",
+ new Object[]{ci.getUri(),language,SUPPORTED_LANGUAGES});
return CANNOT_ENHANCE;
}
- return ENHANCE_SYNCHRONOUS;
- } else {
- // TODO: check whether the metadata graph contains the text
- Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
- if (it.hasNext()) {
- return ENHANCE_SYNCHRONOUS;
- }
- }
+ return ENHANCE_ASYNC; //OpenCalais now support async processing!
+ }
return CANNOT_ENHANCE;
}
public void computeEnhancements(ContentItem ci) throws EngineException {
- String mimeType = ci.getMimeType().split(";", 2)[0].toLowerCase();
- String text = "";
- if (SUPPORTED_MIMETYPES.contains(mimeType)) {
- try {
- text = IOUtils.toString(ci.getStream(),"UTF-8");
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
- } else {
- mimeType = "text/plain";
- text = getMetadataText(ci.getMetadata(), ci.getUri());
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
+ if(contentPart == null){
+ throw new IllegalStateException("No ContentPart with an supported Mimetype '"
+ + SUPPORTED_MIMETYPES+"' found for ContentItem "+ci.getUri()
+ + ": This is also checked in the canEnhance method! -> This "
+ + "indicated an Bug in the implementation of the "
+ + "EnhancementJobManager!");
}
- if (text == null) {
- log.warn("no text found");
- return;
+ String text;
+ try {
+ text = ContentItemHelper.getText(contentPart.getValue());
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
}
- MGraph calaisModel = getCalaisAnalysis(text, mimeType);
+ MGraph calaisModel = getCalaisAnalysis(text, contentPart.getValue().getMimeType());
if (calaisModel != null) {
- createEnhancements(queryModel(calaisModel), ci);
- if (log.isDebugEnabled()) {
- Serializer serializer = Serializer.getInstance();
- ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
- serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
- try {
- log.debug("Calais Enhancements:\n{}",debugStream.toString("UTF-8"));
- } catch (UnsupportedEncodingException e) {
- e.printStackTrace();
- }
+ //Acquire a write lock on the ContentItem when adding the enhancements
+ ci.getLock().writeLock().lock();
+ try {
+ createEnhancements(queryModel(calaisModel), ci);
+ if (log.isDebugEnabled()) {
+ Serializer serializer = Serializer.getInstance();
+ ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
+ serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
+ try {
+ log.debug("Calais Enhancements:\n{}",debugStream.toString("UTF-8"));
+ } catch (UnsupportedEncodingException e) {
+ e.printStackTrace();
+ }
+ }
+ } finally {
+ ci.getLock().writeLock().unlock();
}
}
@@ -587,17 +584,6 @@ public class OpenCalaisEngine
urlConn.getInputStream(), responseEncoding);
}
- public String getMetadataText(MGraph model, NonLiteral subj) {
- String text = "";
- for (Iterator<Triple> it = model.filter(subj, NIE_PLAINTEXTCONTENT, null); it.hasNext();) {
- text += getLexicalForm(it.next().getObject());
- }
- if (text.trim().length() > 0) {
- return text;
- }
- return null;
- }
-
public String getMetadataLanguage(MGraph model, NonLiteral subj) {
Iterator<Triple> it = model.filter(subj, DC_LANGUAGE, null);
if (it.hasNext()) {
@@ -624,21 +610,19 @@ public class OpenCalaisEngine
*/
protected void activate(ComponentContext ce) throws ConfigurationException {
super.activate(ce);
- if (ce != null) {
- this.bundleContext = ce.getBundleContext();
- //TODO initialize Extractor
- Dictionary<String, String> properties = ce.getProperties();
- String license = properties.get(LICENSE_KEY);
- String url = properties.get(CALAIS_URL_KEY);
- calaisTypeMapFile = properties.get(CALAIS_TYPE_MAP_KEY);
- String standAlone = properties.get(CALAIS_NER_ONLY_MODE_KEY);
- setLicenseKey(license);
- setCalaisUrl(url);
- calaisTypeMap = new HashMap<UriRef,UriRef>();
- loadTypeMap(calaisTypeMapFile);
- onlyNERMode = Boolean.parseBoolean(standAlone);
- // this.tcManager = TcManager.getInstance();
- }
+ this.bundleContext = ce.getBundleContext();
+ //TODO initialize Extractor
+ Dictionary<String, Object> properties = ce.getProperties();
+ String license = (String)properties.get(LICENSE_KEY);
+ String url = (String)properties.get(CALAIS_URL_KEY);
+ calaisTypeMapFile = (String)properties.get(CALAIS_TYPE_MAP_KEY);
+ String standAlone = (String)properties.get(CALAIS_NER_ONLY_MODE_KEY);
+ setLicenseKey(license);
+ setCalaisUrl(url);
+ calaisTypeMap = new HashMap<UriRef,UriRef>();
+ loadTypeMap(calaisTypeMapFile);
+ onlyNERMode = Boolean.parseBoolean(standAlone);
+ // this.tcManager = TcManager.getInstance();
}
/**