You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/02 13:44:42 UTC

svn commit: r1404953 - in /stanbol/trunk: enhancer/defaults/src/main/resources/config/ enhancer/engines/opennlp-ner/ enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ enhancer/engines/opennlp-ner/src/test/java...

Author: rwesten
Date: Fri Nov  2 12:44:42 2012
New Revision: 1404953

URL: http://svn.apache.org/viewvc?rev=1404953&view=rev
Log:
STANBOL-792: Changes to the existing NER engine in preparation for the implementation of this issue

* The Engine now supports ConfigurationPolicy.OPTIONAL. Because of that, the default configuration for it in the enhancer.defaults module is no longer required
* The Engine now correctly reports itself as dc:creator (previously the NEREngineCore class was reported). Because of that, the regex statements used by the integration tests needed to be updated. This change makes it possible to distinguish enhancements coming from the NamedEntityExtractionEngine from those to be implemented for STANBOL-792
* Created a new NEREngineConfig class that already allows configuring specific NER models for languages AND also supports the configuration of custom "NER type -> RDF type" mappings
* Extended the NEREngineCore to support specific NER models AND NER type -> RDF type mappings.

So what is missing for STANBOL-792 is the implementation of the Engine for custom models and unit tests for those features.

Added:
    stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java   (with props)
Removed:
    stanbol/trunk/enhancer/defaults/src/main/resources/config/org.apache.stanbol.enhancer.engines.opennlp.impl.NamedEntityExtractionEnhancementEngine-default.config
Modified:
    stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml
    stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
    stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
    stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
    stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
    stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java
    stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/MultipartRequestTest.java

Modified: stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml?rev=1404953&r1=1404952&r2=1404953&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml Fri Nov  2 12:44:42 2012
@@ -79,7 +79,7 @@
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.commons.opennlp</artifactId>
-      <version>0.9.0-incubating</version>
+      <version>0.10.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.stanbol</groupId>

Added: stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java?rev=1404953&view=auto
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java (added)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java Fri Nov  2 12:44:42 2012
@@ -0,0 +1,110 @@
+package org.apache.stanbol.enhancer.engines.opennlp.impl;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+
+public class NEREngineConfig {
+    /**
+     * Default mapping of the NER types of the default OpenNLP name finder
+     * models to the dc:type values added for TextAnnotations.
+     */
+    public static final Map<String,UriRef> DEFAULT_ENTITY_TYPE_MAPPINGS;
+    
+    static { //the default mappings for the default NER types
+        Map<String,UriRef> mappings = new TreeMap<String,UriRef>();
+        //only NER type names as keys: the key set also seeds the default model types
+        mappings.put("person", OntologicalClasses.DBPEDIA_PERSON);
+        mappings.put("location", OntologicalClasses.DBPEDIA_PLACE);
+        mappings.put("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+        DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
+    }
+    
+    /**
+     * Holds the mappings of NER types (as reported by the name finder models)
+     * to the dc:type values used by TextAnnotations. 
+     */
+    private Map<String,UriRef> typeMappings = new HashMap<String,UriRef>(DEFAULT_ENTITY_TYPE_MAPPINGS);
+    
+    private Map<String,Collection<String>> additionalNerModels = new HashMap<String,Collection<String>>();
+    /**
+     * The default model types
+     */
+    private Set<String> defaultModelTypes = new HashSet<String>(DEFAULT_ENTITY_TYPE_MAPPINGS.keySet());
+    /**
+     * TODO: replace with Language as soon as STANBOL-733 is re-integrated with
+     * the Stanbol trunk
+     */
+    private Set<String> processedLanguages = new HashSet<String>();
+    
+    private String defaultLanguage;
+    
+    public void addNerModel(String lang, String modelFileName){
+        if(lang == null || lang.isEmpty()){
+            throw new IllegalArgumentException("The parsed lanaguage MUST NOT be NULL or empty!");
+        }
+        if(modelFileName == null || modelFileName.isEmpty()){
+            throw new IllegalArgumentException("The parsed NER model name MUST NOT be NULL or empty!");
+        }
+        Collection<String> langModels = additionalNerModels.get(lang);
+        if(langModels == null){
+            langModels = new ArrayList<String>();
+            additionalNerModels.put(lang, langModels);
+        }
+        langModels.add(modelFileName);
+    }
+        
+    public Set<String> getProcessedLanguages() {
+        return processedLanguages;
+    }
+    /**
+     * Checks if the parsed language is enabled for processing.
+     * If <code>null</code> is parsed as language this returns <code>false</code>
+     * even if processing of all languages is enabled. <p>
+     * NOTE: If this Method returns <code>true</code> this does
+     * not mean that text with this language can be actually processed because this
+     * also requires that the NER model for this language are available via the
+     * parsed {@link OpenNLP} instance.
+     * @param lang the language
+     * @return the state
+     */
+    public boolean isProcessedLangage(String lang){
+        return lang != null && (processedLanguages.isEmpty() || processedLanguages.contains(lang));
+    }
+    
+    public void setDefaultLanguage(String defaultLanguage) {
+        this.defaultLanguage = defaultLanguage;
+    }
+    
+    public String getDefaultLanguage() {
+        return defaultLanguage;
+    }
+    
+    public void setDefaultModelTypes(Set<String> defaultModelTypes) {
+        this.defaultModelTypes = defaultModelTypes;
+    }
+    
+    public Set<String> getDefaultModelTypes() {
+        return defaultModelTypes;
+    }
+    
+    /** Returns the NER model names added for the parsed language; an empty collection (never <code>null</code>) if none. */
+    public Collection<String> getSpecificNerModles(String lang){
+        Collection<String> modelNames = additionalNerModels.get(lang);
+        return modelNames == null ? Collections.<String>emptyList() : modelNames;
+    }
+    
+    public UriRef getMappedType(String namedEntityType){
+        return typeMappings.get(namedEntityType);
+    }
+}

Propchange: stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1404953&r1=1404952&r2=1404953&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Fri Nov  2 12:44:42 2012
@@ -65,15 +65,19 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Core of our EnhancementEngine, separated from the OSGi service to make it easier to test this.
+ * Core of the NER EnhancementEngine(s), separated from the OSGi service to make 
+ * it easier to test this.
  */
-public class NEREngineCore implements EnhancementEngine {
+public abstract class NEREngineCore 
+        extends AbstractEnhancementEngine<IOException,RuntimeException> 
+        implements EnhancementEngine {
     protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
     /**
      * Contains the only supported mimetype {@link #TEXT_PLAIN_MIMETYPE}
@@ -82,18 +86,10 @@ public class NEREngineCore implements En
             Collections.singleton(TEXT_PLAIN_MIMETYPE);
 
     private final Logger log = LoggerFactory.getLogger(getClass());
-    private static Map<String,UriRef> entityTypes = new HashMap<String,UriRef>();
-    static {
-        entityTypes.put("person", OntologicalClasses.DBPEDIA_PERSON);
-        entityTypes.put("location", OntologicalClasses.DBPEDIA_PLACE);
-        entityTypes.put("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
-    }
     
-    private OpenNLP openNLP;
-
-    private final String defaultLang;
-
-    private final Set<String> processedLangs;
+    protected OpenNLP openNLP;
+    
+    protected NEREngineConfig config;
     
     /** Comments about our models */
     public static final Map<String, String> DATA_FILE_COMMENTS;
@@ -101,28 +97,28 @@ public class NEREngineCore implements En
         DATA_FILE_COMMENTS = new HashMap<String, String>();
         DATA_FILE_COMMENTS.put("Default data files", "provided by the org.apache.stanbol.defaultdata bundle");
     }
-
-    public NEREngineCore(OpenNLP openNLP, String defaultLanguage, Set<String> processedLanguages) throws InvalidFormatException, IOException{
+    /**
+     * If used sub classes MUST ensure that {@link #openNLP} and {@link #config}
+     * are set before calling {@link #canEnhance(ContentItem)} or
+     * {@link #computeEnhancements(ContentItem)}
+     */
+    protected NEREngineCore(){}
+    
+    NEREngineCore(OpenNLP openNLP, NEREngineConfig config) throws InvalidFormatException, IOException{
+        if(openNLP == null){
+            throw new IllegalArgumentException("The parsed OpenNLP instance MUST NOT be NULL!");
+        }
+        if(config == null){
+            throw new IllegalArgumentException("The parsed NER engine configuration MUST NOT be NULL!");
+        }
         this.openNLP = openNLP;
-        this.defaultLang = defaultLanguage;
-        this.processedLangs = Collections.unmodifiableSet(processedLanguages);
+        this.config = config;
     }
     
-    NEREngineCore(DataFileProvider dfp,String defaultLanguage, Set<String> processedLanguages) throws InvalidFormatException, IOException {
-        this(new OpenNLP(dfp),defaultLanguage,processedLanguages);
+    NEREngineCore(DataFileProvider dfp,NEREngineConfig config) throws InvalidFormatException, IOException {
+        this(new OpenNLP(dfp),config);
     }
 
-//    protected TokenNameFinderModel buildNameModel(String name, UriRef typeUri) throws IOException {
-//        //String modelRelativePath = String.format("en-ner-%s.bin", name);
-//        TokenNameFinderModel model = openNLP.getNameModel(name, "en");
-//        // register the name finder instances for matching owl class
-////        entityTypes.put(name, new Object[] {typeUri, model});
-//        return model;
-//    }
-    @Override
-    public String getName() {
-        return getClass().getName();
-    }
 
     public void computeEnhancements(ContentItem ci) throws EngineException {
         //first check the langauge before processing the content (text)
@@ -133,10 +129,9 @@ public class NEREngineCore implements En
                 + "method! -> This indicated an Bug in the implementation of the "
                 + "EnhancementJobManager!");
         }
-        if(!isProcessedLangage(language)){
-            throw new IllegalStateException("The language '"+language+"' of ContentItem "+ci.getUri() 
-                + " is not configured to be processed by this NER engine instance "
-                + "(processed "+processedLangs+"): This is also checked in the canEnhance "
+        if(!isNerModel(language)){
+            throw new IllegalStateException("For the language '"+language+"' of ContentItem "+ci.getUri() 
+                + " no NER model is configured: This is also checked in the canEnhance "
                 + "method! -> This indicated an Bug in the implementation of the "
                 + "EnhancementJobManager!");
         }
@@ -167,14 +162,29 @@ public class NEREngineCore implements En
             new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(), 
                          StringUtils.abbreviate(text, 100)});
         try {
-            for (Map.Entry<String,UriRef> type : entityTypes.entrySet()) {
-                String typeLabel = type.getKey();
-                UriRef typeUri = type.getValue();
-                TokenNameFinderModel nameFinderModel = openNLP.getNameModel(typeLabel, language);
-                if(nameFinderModel == null){
-                    log.info("No NER Model for {} and language {} available!",typeLabel,language);
-                } else {
-                    findNamedEntities(ci, text, language, typeUri, typeLabel, nameFinderModel);
+            if(config.isProcessedLangage(language)){
+                for (String defaultModelType : config.getDefaultModelTypes()) {
+                    TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
+                    if(nameFinderModel == null){
+                        log.info("No NER Model for {} and language {} available!",defaultModelType,language);
+                    } else {
+                        findNamedEntities(ci, text, language, nameFinderModel);
+                    }
+                }
+            } //else do not use default models for languages other than the processed one
+            //process for additional models
+            for(String additionalModel : config.getSpecificNerModles(language)){
+                TokenNameFinderModel nameFinderModel;
+                try {
+                    nameFinderModel = openNLP.getModel(TokenNameFinderModel.class, 
+                        additionalModel, null);
+                    findNamedEntities(ci, text, language, nameFinderModel);
+                } catch (IOException e) {
+                    log.warn("Unable to load TokenNameFinderModel model for language '"+language
+                        + "' (model: "+additionalModel+")",e);
+                } catch (RuntimeException e){
+                    log.warn("Error while creating ChunkerModel for language '"+language
+                        + "' (model: "+additionalModel+")",e);
                 }
             }
         } catch (Exception e) {
@@ -189,8 +199,6 @@ public class NEREngineCore implements En
     protected void findNamedEntities(final ContentItem ci,
                                      final String text,
                                      final String lang,
-                                     final UriRef typeUri,
-                                     final String typeLabel,
                                      final TokenNameFinderModel nameFinderModel) {
 
         if (ci == null) {
@@ -206,8 +214,10 @@ public class NEREngineCore implements En
         } else {
             language = null;
         }
-        log.debug("findNamedEntities typeUri={}, type={}, text=", 
-                new Object[]{ typeUri, typeLabel, StringUtils.abbreviate(text, 100) });
+        if(log.isDebugEnabled()){
+            log.debug("findNamedEntities model={},  language={}, text=", 
+                    new Object[]{ nameFinderModel, language, StringUtils.abbreviate(text, 100) });
+        }
         LiteralFactory literalFactory = LiteralFactory.getInstance();
         MGraph g = ci.getMetadata();
         Map<String,List<NameOccurrence>> entityNames = extractNameOccurrences(nameFinderModel, text);
@@ -228,7 +238,9 @@ public class NEREngineCore implements En
                         new PlainLiteralImpl(name, language)));
                     g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, 
                         new PlainLiteralImpl(occurrence.context, language)));
-                    g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri));
+                    if(occurrence.type != null){
+                        g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
+                    }
                     g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory
                             .createTypedLiteral(occurrence.confidence)));
                     if (occurrence.start != null && occurrence.end != null) {
@@ -399,8 +411,9 @@ public class NEREngineCore implements En
                 int start = tokenSpans[nameSpans[j].getStart()].getStart();
                 int absoluteStart = sentenceSpans[i].getStart() + start;
                 int absoluteEnd = absoluteStart + name.length();
-                NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, context,
-                        confidence);
+                UriRef mappedType = config.getMappedType(nameSpans[j].getType());
+                NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, 
+                    mappedType, context, confidence);
 
                 List<NameOccurrence> occurrences = nameOccurrences.get(name);
                 if (occurrences == null) {
@@ -416,11 +429,12 @@ public class NEREngineCore implements En
     }
 
     public int canEnhance(ContentItem ci) {
-        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null 
-                && isProcessedLangage(extractLanguage(ci))){
-                return ENHANCE_ASYNC; //The NER engine now supports Async processing!
+        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null &&
+                isNerModel(extractLanguage(ci))){
+            return ENHANCE_ASYNC;
+        } else {
+            return CANNOT_ENHANCE;
         }
-        return CANNOT_ENHANCE;
     }
 
     /**
@@ -445,38 +459,6 @@ public class NEREngineCore implements En
     }
 
     /**
-     * The default language
-     * @return the defaultLang
-     */
-    public String getDefaultLanguage() {
-        return defaultLang;
-    }
-    /**
-     * Checks if the parsed language is enabled for processing.
-     * If <code>null</code> is parsed as language this returns <code>false</code>
-     * even if processing of all languages is enabled. <p>
-     * NOTE: If this Method returns <code>true</code> this does
-     * not mean that text with this language can be actually processed because this
-     * also requires that the NER model for this language are available via the
-     * parsed {@link OpenNLP} instance.
-     * @param lang the language
-     * @return the state
-     */
-    public boolean isProcessedLangage(String lang){
-        return lang != null && (processedLangs.isEmpty() || processedLangs.contains(lang));
-    }
-    /*
-     * The following Utility extracts the language from the metadata of the
-     * parsed Content Item.
-     * This Utility is actually a copy of the same form the KeywordExtractionEngine.
-     * TODO: change this to a global Utility as soon as STANBOL Enhancement
-     * Structure is defined
-     */
-    /**
-     * The literal representing the LangIDEngine as creator.
-     */
-    public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
-    /**
      * Extracts the language of the parsed ContentItem by using
      * {@link EnhancementEngineHelper#getLanguage(ContentItem)} and 
      * {@link #defaultLang} as default
@@ -485,26 +467,26 @@ public class NEREngineCore implements En
      */
     private String extractLanguage(ContentItem ci) {
         String lang = EnhancementEngineHelper.getLanguage(ci);
-//        MGraph metadata = ci.getMetadata();
-//        Iterator<Triple> langaugeEnhancementCreatorTriples = 
-//            metadata.filter(null, Properties.DC_CREATOR, LANG_ID_ENGINE_NAME);
-//        if(langaugeEnhancementCreatorTriples.hasNext()){
-//            String lang = EnhancementEngineHelper.getString(metadata, 
-//                langaugeEnhancementCreatorTriples.next().getSubject(), 
-//                Properties.DC_LANGUAGE);
         if(lang != null){
             return lang;
         } else {
-            log.info("Unable to extract language for ContentItem %s! The Enhancement of the %s is missing the %s property",
-                new Object[]{ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE});
-            log.info(" ... return '{}' as default",defaultLang);
-            return defaultLang;
-        }
-//        } else {
-//            log.info("Unable to extract language for ContentItem {}! Is the {} active?",
-//                ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm());
-//            log.info(" ... return '{}' as default",defaultLang);
-//            return defaultLang;
-//        }
+            log.info("Unable to extract language for ContentItem {}!",ci.getUri().getUnicodeString()); //SLF4J placeholder is {}
+            log.info(" ... return '{}' as default",config.getDefaultLanguage());
+            return config.getDefaultLanguage();
+        }
+    }
+    /**
+     * Checks if any NER model is configured for the parsed language. This is
+     * the case if the language is accepted by
+     * {@link NEREngineConfig#isProcessedLangage(String)} and the
+     * {@link NEREngineConfig#getDefaultModelTypes()} are not empty OR if
+     * {@link NEREngineConfig#getSpecificNerModles(String)} is not empty.
+     * @param lang The language to check
+     * @return if there is any NER model configured for the parsed language
+     */
+    public boolean isNerModel(String lang){
+        boolean defaultModels = config.isProcessedLangage(lang) && !config.getDefaultModelTypes().isEmpty();
+        //language specific models are only checked if no default model applies
+        return defaultModels || !config.getSpecificNerModles(lang).isEmpty();
+    }
 }

Modified: stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java?rev=1404953&r1=1404952&r2=1404953&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java (original)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java Fri Nov  2 12:44:42 2012
@@ -16,6 +16,8 @@
  */
 package org.apache.stanbol.enhancer.engines.opennlp.impl;
 
+import org.apache.clerezza.rdf.core.UriRef;
+
 public class NameOccurrence {
 
     public final String name;
@@ -28,11 +30,14 @@ public class NameOccurrence {
 
     public final Double confidence;
 
-    public NameOccurrence(String name, Integer start, Integer end,
+    public final UriRef type;
+
+    public NameOccurrence(String name, Integer start, Integer end, UriRef type,
             String context, Double confidence) {
         this.start = start;
         this.end = end;
         this.name = name;
+        this.type = type;
         this.context = context;
         this.confidence = confidence;
     }
@@ -40,8 +45,8 @@ public class NameOccurrence {
     @Override
     public String toString() {
         return String.format(
-                "[name='%s', start='%d', end='%d', confidence='%f', context='%s']",
-                name, start, end, confidence, context);
+                "[name='%s', start='%d', end='%d', type='%s', confidence='%f', context='%s']",
+                name, start, end, type, confidence, context);
     }
 
 }

Modified: stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java?rev=1404953&r1=1404952&r2=1404953&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java (original)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java Fri Nov  2 12:44:42 2012
@@ -20,23 +20,19 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.HashSet;
 import java.util.Map;
-import java.util.Set;
 
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.ConfigurationPolicy;
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
 import org.apache.felix.scr.annotations.Service;
 import org.apache.stanbol.commons.opennlp.OpenNLP;
-import org.apache.stanbol.enhancer.servicesapi.ContentItem;
-import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
-import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.osgi.framework.Constants;
-import org.osgi.framework.ServiceRegistration;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
 
@@ -49,7 +45,7 @@ import org.osgi.service.component.Compon
     immediate = true,
     inherit = true,
     configurationFactory = true, 
-    policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
+    policy = ConfigurationPolicy.OPTIONAL,
     specVersion = "1.1", 
     label = "%stanbol.NamedEntityExtractionEnhancementEngine.name", 
     description = "%stanbol.NamedEntityExtractionEnhancementEngine.description")
@@ -58,14 +54,16 @@ import org.osgi.service.component.Compon
     @Property(name=EnhancementEngine.PROPERTY_NAME,value="ner"),
     @Property(name=NamedEntityExtractionEnhancementEngine.PROCESSED_LANGUAGES,value=""),
     @Property(name=NamedEntityExtractionEnhancementEngine.DEFAULT_LANGUAGE,value=""),
-    @Property(name=Constants.SERVICE_RANKING,intValue=0)
+    //set the ranking of the default config to a negative value (ConfigurationPolicy.OPTIONAL) 
+    @Property(name=Constants.SERVICE_RANKING,intValue=-100) 
 })
+@Reference(name="openNLP",referenceInterface=OpenNLP.class, 
+    cardinality=ReferenceCardinality.MANDATORY_UNARY,
+    policy=ReferencePolicy.STATIC)
 public class NamedEntityExtractionEnhancementEngine 
-        extends AbstractEnhancementEngine<IOException,RuntimeException> 
+        extends NEREngineCore
         implements EnhancementEngine, ServiceProperties {
 
-    private EnhancementEngine engineCore;
-    
     public static final String DEFAULT_DATA_OPEN_NLP_MODEL_LOCATION = "org/apache/stanbol/defaultdata/opennlp";
 
     /**
@@ -89,61 +87,60 @@ public class NamedEntityExtractionEnhanc
      * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
      */
     public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
-
-    private ServiceRegistration dfpServiceRegistration;
-    
-    @Reference
-    private OpenNLP openNLP;
+    /**
+     * Bind method of {@link NEREngineCore#openNLP}
+     * @param openNlp
+     */
+    protected void bindOpenNLP(OpenNLP openNlp){
+        this.openNLP = openNlp;
+    }
+    /**
+     * Unbind method of {@link NEREngineCore#openNLP}
+     * @param openNLP
+     */
+    protected void unbindOpenNLP(OpenNLP openNLP){
+        this.openNLP = null;
+    }
     
     protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
         super.activate(ctx);
+        config = new NEREngineConfig();
         // Need to register the default data before loading the models
         Object value = ctx.getProperties().get(DEFAULT_LANGUAGE);
-        final String defaultLanguage;
         if(value != null && !value.toString().isEmpty()){
-            defaultLanguage = value.toString();
-        } else {
-            defaultLanguage = null;
-        }
+            config.setDefaultLanguage(value.toString());
+        } //else no default language
+        
         value = ctx.getProperties().get(PROCESSED_LANGUAGES);
-        final Set<String> processedLanguages;
         if(value instanceof String[]){
-            processedLanguages = new HashSet<String>(Arrays.asList((String[]) value));
-            processedLanguages.remove(null); //remove null
-            processedLanguages.remove(""); //remove empty
+            config.getProcessedLanguages().addAll(Arrays.asList((String[]) value));
+            config.getProcessedLanguages().remove(null); //remove null
+            config.getProcessedLanguages().remove(""); //remove empty
         } else if (value instanceof Collection<?>){
-            processedLanguages = new HashSet<String>();
             for(Object o : ((Collection<?>)value)){
                 if(o != null){
-                    processedLanguages.add(o.toString());
+                    config.getProcessedLanguages().add(o.toString());
                 }
             }
-            processedLanguages.remove(""); //remove empty
+            config.getProcessedLanguages().remove(""); //remove empty
         } else if(value != null && !value.toString().isEmpty()){
             //if a single String is parsed we support ',' as seperator
             String[] languageArray = value.toString().split(",");
-            processedLanguages = new HashSet<String>(Arrays.asList(languageArray));
-            processedLanguages.remove(null); //remove null
-            processedLanguages.remove(""); //remove empty
-        } else { //no configuration
-            processedLanguages = Collections.emptySet();
-        }
-        if(!processedLanguages.isEmpty() && defaultLanguage != null &&
-                !processedLanguages.contains(defaultLanguage)){
+            config.getProcessedLanguages().addAll(Arrays.asList(languageArray));
+            config.getProcessedLanguages().remove(null); //remove null
+            config.getProcessedLanguages().remove(""); //remove empty
+        } //else no configuration
+        if(!config.getProcessedLanguages().isEmpty() && config.getDefaultLanguage() != null &&
+                !config.getProcessedLanguages().contains(config.getDefaultLanguage())){
             throw new ConfigurationException(PROCESSED_LANGUAGES, "The list of" +
-            		"processed Languages "+processedLanguages+" MUST CONTAIN the" +
-            		"configured default language '"+defaultLanguage+"'!");
+            		" processed Languages "+config.getProcessedLanguages()+" MUST CONTAIN the " +
+            		"configured default language '"+config.getDefaultLanguage()+"'!");
         }
-        engineCore = new NEREngineCore(openNLP, defaultLanguage, processedLanguages);
     }
 
     protected void deactivate(ComponentContext ctx) {
+        config = null;
         super.deactivate(ctx);
-        if(dfpServiceRegistration != null) {
-            dfpServiceRegistration.unregister();
-            dfpServiceRegistration = null;
-        }
-        engineCore = null;
     }
     
     @Override
@@ -152,22 +149,22 @@ public class NamedEntityExtractionEnhanc
             (Object) defaultOrder));
     }
 
-    @Override
-    public int canEnhance(ContentItem ci) throws EngineException {
-        checkCore();
-        return engineCore.canEnhance(ci);
-    }
-
-    @Override
-    public void computeEnhancements(ContentItem ci) throws EngineException {
-        checkCore();
-        engineCore.computeEnhancements(ci);
-    }
+//    @Override
+//    public int canEnhance(ContentItem ci) throws EngineException {
+//        checkCore();
+//        return engineCore.canEnhance(ci);
+//    }
+
+//    @Override
+//    public void computeEnhancements(ContentItem ci) throws EngineException {
+//        checkCore();
+//        engineCore.computeEnhancements(ci);
+//    }
     
-    private void checkCore() {
-        if(engineCore == null) {
-            throw new IllegalStateException("EngineCore not initialized");
-        }
-    }
+//    private void checkCore() {
+//        if(engineCore == null) {
+//            throw new IllegalStateException("EngineCore not initialized");
+//        }
+//    }
 
 }
\ No newline at end of file

Modified: stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1404953&r1=1404952&r2=1404953&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Fri Nov  2 12:44:42 2012
@@ -16,6 +16,7 @@
  */
 package org.apache.stanbol.enhancer.engines.opennlp.impl;
 
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTextAnnotations;
 
 import java.io.IOException;
@@ -29,6 +30,8 @@ import org.apache.clerezza.rdf.core.Lite
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
@@ -59,16 +62,19 @@ public class TestNamedEntityExtractionEn
     
     public static final String FAKE_BUNDLE_SYMBOLIC_NAME = "FAKE_BUNDLE_SYMBOLIC_NAME";
 
-    @SuppressWarnings("unchecked")
     @BeforeClass
     public static void setUpServices() throws IOException {
         nerEngine = new NEREngineCore(new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME),
-            "en",Collections.EMPTY_SET);
+            new NEREngineConfig()){};
     }
 
     public static ContentItem wrapAsContentItem(final String id,
-            final String text) throws IOException {
-    	return ciFactory.createContentItem(new UriRef(id),new StringSource(text));
+            final String text, String language) throws IOException {
+    	ContentItem ci =  ciFactory.createContentItem(new UriRef(id),new StringSource(text));
+    	if(language != null){
+    	    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl(language)));
+    	}
+    	return ci;
     }
 
     @Test
@@ -124,7 +130,7 @@ public class TestNamedEntityExtractionEn
     @Test
     public void testComputeEnhancements()
             throws EngineException, IOException {
-        ContentItem ci = wrapAsContentItem("my doc id", SINGLE_SENTENCE);
+        ContentItem ci = wrapAsContentItem("urn:test:content-item:single:sentence", SINGLE_SENTENCE,"en");
         nerEngine.computeEnhancements(ci);
         Map<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
         expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());

Modified: stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java?rev=1404953&r1=1404952&r2=1404953&view=diff
==============================================================================
--- stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java (original)
+++ stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java Fri Nov  2 12:44:42 2012
@@ -76,7 +76,7 @@ public class DefaultChainTest extends En
                 "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine",
                 "http://purl.org/dc/terms/language.*en",
                 "http://fise.iks-project.eu/ontology/entity-label.*Paris",
-                "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
+                "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine",
                 "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley",
                 //the following two lines test the use of plain literals (see STANBOL-509)
                 "http://fise.iks-project.eu/ontology/selected-text.*\"Bob Marley\"@en",

Modified: stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/MultipartRequestTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/MultipartRequestTest.java?rev=1404953&r1=1404952&r2=1404953&view=diff
==============================================================================
--- stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/MultipartRequestTest.java (original)
+++ stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/MultipartRequestTest.java Fri Nov  2 12:44:42 2012
@@ -184,7 +184,7 @@ public class MultipartRequestTest extend
             "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine",
             "http://purl.org/dc/terms/language.*en",
             "http://fise.iks-project.eu/ontology/entity-label.*Paris",
-            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
+            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine",
             "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley")
         .getContent();
         log.debug("Content:\n{}\n",content);
@@ -223,7 +223,7 @@ public class MultipartRequestTest extend
             "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine",
             "http://purl.org/dc/terms/language.*en",
             "http://fise.iks-project.eu/ontology/entity-label.*Paris",
-            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
+            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine",
             "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley")
         .getContent();
         log.debug("Content:\n{}\n",content);
@@ -258,7 +258,7 @@ public class MultipartRequestTest extend
             "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine",
             "http://purl.org/dc/terms/language.*en",
             "http://fise.iks-project.eu/ontology/entity-label.*Paris",
-            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
+            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine",
             "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley")
         .getContent();
         log.debug("Content:\n{}\n",content);
@@ -294,7 +294,7 @@ public class MultipartRequestTest extend
              "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine",
              "http://purl.org/dc/terms/language.*en",
              "http://fise.iks-project.eu/ontology/entity-label.*Paris",
-             "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
+             "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine",
              "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley")
          .getContent();
         log.debug("Content:\n{}\n",content);
@@ -371,7 +371,7 @@ public class MultipartRequestTest extend
             "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine",
             "http://purl.org/dc/terms/language.*en",
             "http://fise.iks-project.eu/ontology/entity-label.*Paris",
-            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
+            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine",
             "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley",
             //check also for expeted entities extracted from the secret Text part!
             "http://fise.iks-project.eu/ontology/entity-label.*Berlin",
@@ -512,7 +512,7 @@ public class MultipartRequestTest extend
             "http://purl.org/dc/terms/creator.*LanguageDetectionEnhancementEngine",
             "http://purl.org/dc/terms/language.*en",
             "http://fise.iks-project.eu/ontology/entity-label.*Paris",
-            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
+            "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*NamedEntityExtractionEnhancementEngine",
             "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley",
             //additional enhancements based on parsed metadata
             "http://fise.iks-project.eu/ontology/entity-reference.*http://dbpedia.org/resource/Germany.*",