You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/05 11:22:02 UTC
svn commit: r1405731 - in /stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner: ./ src/license/ src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ src/main/resources/OSGI-INF/metatype/ src/test/java/org/apache/stanbol/enha...

Author: rwesten
Date: Mon Nov  5 10:22:01 2012
New Revision: 1405731

URL: http://svn.apache.org/viewvc?rev=1405731&view=rev
Log:
STANBOL-792: merged implementation to the stanbol-nlp-processing branch

Added:
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/CustomNERModelEnhancementEngine.java
      - copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/CustomNERModelEnhancementEngine.java
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
      - copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/
      - copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/
      - copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/
      - copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/
      - copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/
      - copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/LICENSE
      - copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/LICENSE
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/README.md
      - copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/README.md
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/bionlp2004-DNA-en.bin
      - copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/bionlp2004-DNA-en.bin
Modified:
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/   (props changed)
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java

Propchange: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Mon Nov  5 10:22:01 2012
@@ -0,0 +1,3 @@
+/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/opennlp-ner:1374978-1386535
+/incubator/stanbol/trunk/enhancer/engines/opennlp-ner:1339554,1339557-1339558,1386989-1388016
+/stanbol/trunk/enhancer/engines/opennlp-ner:1388017-1405730

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml Mon Nov  5 10:22:01 2012
@@ -38,12 +38,12 @@
 
   <scm>
     <connection>
-      scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/
+      scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/opennlp-ner/
     </connection>
     <developerConnection>
-      scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/
+      scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/opennlp-ner/
     </developerConnection>
-    <url>http://incubator.apache.org/stanbol/</url>
+    <url>http://stanbol.apache.org/</url>
   </scm>
 
 
@@ -60,6 +60,8 @@
             </Private-Package>
             <Import-Package>
               !net.didion.*,
+              org.apache.stanbol.enhancer.servicesapi; provide:=true,
+              org.apache.stanbol.enhancer.servicesapi.impl; provide:=true,
               *
             </Import-Package>
           </instructions>
@@ -77,7 +79,7 @@
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.commons.opennlp</artifactId>
-      <version>0.9.0-incubating</version>
+      <version>0.10.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.stanbol</groupId>

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties Mon Nov  5 10:22:01 2012
@@ -1,18 +1,25 @@
 # Generated by org.codehaus.mojo.license.AddThirdPartyMojo
 #-------------------------------------------------------------------------------
 # Already used licenses in project :
-# - Apache License
-# - Common Development and Distribution License (CDDL) v1.0
-# - Common Public License Version 1.0
+# - Apache Software License
+# - Apache Software License, Version 2.0
+# - BSD License
+# - Common Development And Distribution License (CDDL), Version 1.0
+# - Common Development And Distribution License (CDDL), Version 1.1
+# - Common Public License, Version 1.0
+# - Eclipse Public License, Version 1.0
+# - GNU General Public License (GPL), Version 2 with classpath exception
+# - GNU Lesser General Public License (LGPL)
+# - GNU Lesser General Public License (LGPL), Version 2.1
 # - ICU License
 # - MIT License
-# - The Apache Software License, Version 2.0
+# - Public Domain License
 #-------------------------------------------------------------------------------
 # Please fill the missing licenses for dependencies :
 #
 #
-#Wed Feb 15 19:06:02 CET 2012
-javax.servlet--servlet-api--2.4=Common Development And Distribution License (CDDL), Version 1.0
+#Sun Oct 07 16:31:16 CEST 2012
+javax.servlet--servlet-api--2.5=Common Development And Distribution License (CDDL), Version 1.0
 jwnl--jwnl--1.3.3=BSD License
 org.osgi--org.osgi.compendium--4.1.0=The Apache Software License, Version 2.0
 org.osgi--org.osgi.core--4.1.0=The Apache Software License, Version 2.0

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Mon Nov  5 10:22:01 2012
@@ -65,15 +65,19 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Core of our EnhancementEngine, separated from the OSGi service to make it easier to test this.
+ * Core of the NER EnhancementEngine(s), separated from the OSGi service to make 
+ * it easier to test this.
  */
-public class NEREngineCore implements EnhancementEngine {
+public abstract class NEREngineCore 
+        extends AbstractEnhancementEngine<IOException,RuntimeException> 
+        implements EnhancementEngine {
     protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
     /**
      * Contains the only supported mimetype {@link #TEXT_PLAIN_MIMETYPE}
@@ -82,18 +86,10 @@ public class NEREngineCore implements En
             Collections.singleton(TEXT_PLAIN_MIMETYPE);
 
     private final Logger log = LoggerFactory.getLogger(getClass());
-    private static Map<String,UriRef> entityTypes = new HashMap<String,UriRef>();
-    static {
-        entityTypes.put("person", OntologicalClasses.DBPEDIA_PERSON);
-        entityTypes.put("location", OntologicalClasses.DBPEDIA_PLACE);
-        entityTypes.put("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
-    }
     
-    private OpenNLP openNLP;
-
-    private final String defaultLang;
-
-    private final Set<String> processedLangs;
+    protected OpenNLP openNLP;
+    
+    protected NEREngineConfig config;
     
     /** Comments about our models */
     public static final Map<String, String> DATA_FILE_COMMENTS;
@@ -101,28 +97,28 @@ public class NEREngineCore implements En
         DATA_FILE_COMMENTS = new HashMap<String, String>();
         DATA_FILE_COMMENTS.put("Default data files", "provided by the org.apache.stanbol.defaultdata bundle");
     }
-
-    public NEREngineCore(OpenNLP openNLP, String defaultLanguage, Set<String> processedLanguages) throws InvalidFormatException, IOException{
+    /**
+     * If this constructor is used, subclasses MUST ensure that {@link #openNLP}
+     * and {@link #config} are set before calling {@link #canEnhance(ContentItem)}
+     * or {@link #computeEnhancements(ContentItem)}.
+     */
+    protected NEREngineCore(){}
+    
+    NEREngineCore(OpenNLP openNLP, NEREngineConfig config) throws InvalidFormatException, IOException{
+        if(openNLP == null){
+            throw new IllegalArgumentException("The parsed OpenNLP instance MUST NOT be NULL!");
+        }
+        if(config == null){
+            throw new IllegalArgumentException("The parsed NER engine configuration MUST NOT be NULL!");
+        }
         this.openNLP = openNLP;
-        this.defaultLang = defaultLanguage;
-        this.processedLangs = Collections.unmodifiableSet(processedLanguages);
+        this.config = config;
     }
     
-    NEREngineCore(DataFileProvider dfp,String defaultLanguage, Set<String> processedLanguages) throws InvalidFormatException, IOException {
-        this(new OpenNLP(dfp),defaultLanguage,processedLanguages);
+    NEREngineCore(DataFileProvider dfp,NEREngineConfig config) throws InvalidFormatException, IOException {
+        this(new OpenNLP(dfp),config);
     }
 
-//    protected TokenNameFinderModel buildNameModel(String name, UriRef typeUri) throws IOException {
-//        //String modelRelativePath = String.format("en-ner-%s.bin", name);
-//        TokenNameFinderModel model = openNLP.getNameModel(name, "en");
-//        // register the name finder instances for matching owl class
-////        entityTypes.put(name, new Object[] {typeUri, model});
-//        return model;
-//    }
-    @Override
-    public String getName() {
-        return getClass().getName();
-    }
 
     public void computeEnhancements(ContentItem ci) throws EngineException {
         //first check the langauge before processing the content (text)
@@ -133,10 +129,9 @@ public class NEREngineCore implements En
                 + "method! -> This indicated an Bug in the implementation of the "
                 + "EnhancementJobManager!");
         }
-        if(!isProcessedLangage(language)){
-            throw new IllegalStateException("The language '"+language+"' of ContentItem "+ci.getUri() 
-                + " is not configured to be processed by this NER engine instance "
-                + "(processed "+processedLangs+"): This is also checked in the canEnhance "
+        if(!isNerModel(language)){
+            throw new IllegalStateException("For the language '"+language+"' of ContentItem "+ci.getUri() 
+                + " no NER model is configured: This is also checked in the canEnhance "
                 + "method! -> This indicated an Bug in the implementation of the "
                 + "EnhancementJobManager!");
         }
@@ -167,14 +162,29 @@ public class NEREngineCore implements En
             new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(), 
                          StringUtils.abbreviate(text, 100)});
         try {
-            for (Map.Entry<String,UriRef> type : entityTypes.entrySet()) {
-                String typeLabel = type.getKey();
-                UriRef typeUri = type.getValue();
-                TokenNameFinderModel nameFinderModel = openNLP.getNameModel(typeLabel, language);
-                if(nameFinderModel == null){
-                    log.info("No NER Model for {} and language {} available!",typeLabel,language);
-                } else {
-                    findNamedEntities(ci, text, language, typeUri, typeLabel, nameFinderModel);
+            if(config.isProcessedLangage(language)){
+                for (String defaultModelType : config.getDefaultModelTypes()) {
+                    TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
+                    if(nameFinderModel == null){
+                        log.info("No NER Model for {} and language {} available!",defaultModelType,language);
+                    } else {
+                        findNamedEntities(ci, text, language, nameFinderModel);
+                    }
+                }
+            } //else do not use default models for languages other than the processed one
+            //process for additional models
+            for(String additionalModel : config.getSpecificNerModles(language)){
+                TokenNameFinderModel nameFinderModel;
+                try {
+                    nameFinderModel = openNLP.getModel(TokenNameFinderModel.class, 
+                        additionalModel, null);
+                    findNamedEntities(ci, text, language, nameFinderModel);
+                } catch (IOException e) {
+                    log.warn("Unable to load TokenNameFinderModel model for language '"+language
+                        + "' (model: "+additionalModel+")",e);
+                } catch (RuntimeException e){
+                    log.warn("Error while creating ChunkerModel for language '"+language
+                        + "' (model: "+additionalModel+")",e);
                 }
             }
         } catch (Exception e) {
@@ -189,8 +199,6 @@ public class NEREngineCore implements En
     protected void findNamedEntities(final ContentItem ci,
                                      final String text,
                                      final String lang,
-                                     final UriRef typeUri,
-                                     final String typeLabel,
                                      final TokenNameFinderModel nameFinderModel) {
 
         if (ci == null) {
@@ -206,8 +214,10 @@ public class NEREngineCore implements En
         } else {
             language = null;
         }
-        log.debug("findNamedEntities typeUri={}, type={}, text=", 
-                new Object[]{ typeUri, typeLabel, StringUtils.abbreviate(text, 100) });
+        if(log.isDebugEnabled()){
+            log.debug("findNamedEntities model={},  language={}, text=", 
+                    new Object[]{ nameFinderModel, language, StringUtils.abbreviate(text, 100) });
+        }
         LiteralFactory literalFactory = LiteralFactory.getInstance();
         MGraph g = ci.getMetadata();
         Map<String,List<NameOccurrence>> entityNames = extractNameOccurrences(nameFinderModel, text);
@@ -228,7 +238,9 @@ public class NEREngineCore implements En
                         new PlainLiteralImpl(name, language)));
                     g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, 
                         new PlainLiteralImpl(occurrence.context, language)));
-                    g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri));
+                    if(occurrence.type != null){
+                        g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
+                    }
                     g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory
                             .createTypedLiteral(occurrence.confidence)));
                     if (occurrence.start != null && occurrence.end != null) {
@@ -388,10 +400,10 @@ public class NEREngineCore implements En
             String[] tokens = Span.spansToStrings(tokenSpans, sentence);
             Span[] nameSpans = finder.find(tokens);
             double[] probs = finder.probs();
-            String[] names = Span.spansToStrings(nameSpans, tokens);
             //int lastStartPosition = 0;
-            for (int j = 0; j < names.length; j++) {
-                String name = names[j];
+            for (int j = 0; j < nameSpans.length; j++) {
+                String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(), 
+                    tokenSpans[nameSpans[j].getEnd()-1].getEnd());
                 Double confidence = 1.0;
                 for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
                     confidence *= probs[k];
@@ -399,8 +411,9 @@ public class NEREngineCore implements En
                 int start = tokenSpans[nameSpans[j].getStart()].getStart();
                 int absoluteStart = sentenceSpans[i].getStart() + start;
                 int absoluteEnd = absoluteStart + name.length();
-                NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, context,
-                        confidence);
+                UriRef mappedType = config.getMappedType(nameSpans[j].getType());
+                NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, 
+                    mappedType, context, confidence);
 
                 List<NameOccurrence> occurrences = nameOccurrences.get(name);
                 if (occurrences == null) {
@@ -416,11 +429,12 @@ public class NEREngineCore implements En
     }
 
     public int canEnhance(ContentItem ci) {
-        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null 
-                && isProcessedLangage(extractLanguage(ci))){
-                return ENHANCE_ASYNC; //The NER engine now supports Async processing!
+        if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null &&
+                isNerModel(extractLanguage(ci))){
+            return ENHANCE_ASYNC;
+        } else {
+            return CANNOT_ENHANCE;
         }
-        return CANNOT_ENHANCE;
     }
 
     /**
@@ -445,38 +459,6 @@ public class NEREngineCore implements En
     }
 
     /**
-     * The default language
-     * @return the defaultLang
-     */
-    public String getDefaultLanguage() {
-        return defaultLang;
-    }
-    /**
-     * Checks if the parsed language is enabled for processing.
-     * If <code>null</code> is parsed as language this returns <code>false</code>
-     * even if processing of all languages is enabled. <p>
-     * NOTE: If this Method returns <code>true</code> this does
-     * not mean that text with this language can be actually processed because this
-     * also requires that the NER model for this language are available via the
-     * parsed {@link OpenNLP} instance.
-     * @param lang the language
-     * @return the state
-     */
-    public boolean isProcessedLangage(String lang){
-        return lang != null && (processedLangs.isEmpty() || processedLangs.contains(lang));
-    }
-    /*
-     * The following Utility extracts the language from the metadata of the
-     * parsed Content Item.
-     * This Utility is actually a copy of the same form the KeywordExtractionEngine.
-     * TODO: change this to a global Utility as soon as STANBOL Enhancement
-     * Structure is defined
-     */
-    /**
-     * The literal representing the LangIDEngine as creator.
-     */
-    public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
-    /**
      * Extracts the language of the parsed ContentItem by using
      * {@link EnhancementEngineHelper#getLanguage(ContentItem)} and 
      * {@link #defaultLang} as default
@@ -485,26 +467,26 @@ public class NEREngineCore implements En
      */
     private String extractLanguage(ContentItem ci) {
         String lang = EnhancementEngineHelper.getLanguage(ci);
-//        MGraph metadata = ci.getMetadata();
-//        Iterator<Triple> langaugeEnhancementCreatorTriples = 
-//            metadata.filter(null, Properties.DC_CREATOR, LANG_ID_ENGINE_NAME);
-//        if(langaugeEnhancementCreatorTriples.hasNext()){
-//            String lang = EnhancementEngineHelper.getString(metadata, 
-//                langaugeEnhancementCreatorTriples.next().getSubject(), 
-//                Properties.DC_LANGUAGE);
         if(lang != null){
             return lang;
         } else {
-            log.info("Unable to extract language for ContentItem %s! The Enhancement of the %s is missing the %s property",
-                new Object[]{ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE});
-            log.info(" ... return '{}' as default",defaultLang);
-            return defaultLang;
-        }
-//        } else {
-//            log.info("Unable to extract language for ContentItem {}! Is the {} active?",
-//                ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm());
-//            log.info(" ... return '{}' as default",defaultLang);
-//            return defaultLang;
-//        }
+            log.info("Unable to extract language for ContentItem %s!",ci.getUri().getUnicodeString());
+            log.info(" ... return '{}' as default",config.getDefaultLanguage());
+            return config.getDefaultLanguage();
+        }
+    }
+    /**
+     * Checks if the configuration has a NER model for the parsed language.
+     * This is the case if the parsed language is processed (see
+     * {@link NEREngineConfig#isProcessedLangage(String)}) and at least one of the
+     * {@link NEREngineConfig#getDefaultModelTypes()} is configured, OR if any
+     * {@link NEREngineConfig#getSpecificNerModles(String)} is configured for that language.
+     * @param lang The language to check
+     * @return if there is any NER model configured for the parsed language
+     */
+    public boolean isNerModel(String lang){
+        return (config.isProcessedLangage(lang) && !config.getDefaultModelTypes().isEmpty()) ||
+               !config.getSpecificNerModles(lang).isEmpty();
+                
     }
 }

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java Mon Nov  5 10:22:01 2012
@@ -16,6 +16,8 @@
  */
 package org.apache.stanbol.enhancer.engines.opennlp.impl;
 
+import org.apache.clerezza.rdf.core.UriRef;
+
 public class NameOccurrence {
 
     public final String name;
@@ -28,11 +30,14 @@ public class NameOccurrence {
 
     public final Double confidence;
 
-    public NameOccurrence(String name, Integer start, Integer end,
+    public final UriRef type;
+
+    public NameOccurrence(String name, Integer start, Integer end, UriRef type,
             String context, Double confidence) {
         this.start = start;
         this.end = end;
         this.name = name;
+        this.type = type;
         this.context = context;
         this.confidence = confidence;
     }
@@ -40,8 +45,8 @@ public class NameOccurrence {
     @Override
     public String toString() {
         return String.format(
-                "[name='%s', start='%d', end='%d', confidence='%f', context='%s']",
-                name, start, end, confidence, context);
+                "[name='%s', start='%d', end='%d', type='%s', confidence='%f', context='%s']",
+                name, start, end, type, confidence, context);
     }
 
 }

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java Mon Nov  5 10:22:01 2012
@@ -20,23 +20,19 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.HashSet;
 import java.util.Map;
-import java.util.Set;
 
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.ConfigurationPolicy;
 import org.apache.felix.scr.annotations.Property;
 import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
 import org.apache.felix.scr.annotations.Service;
 import org.apache.stanbol.commons.opennlp.OpenNLP;
-import org.apache.stanbol.enhancer.servicesapi.ContentItem;
-import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
-import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.osgi.framework.Constants;
-import org.osgi.framework.ServiceRegistration;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
 
@@ -49,7 +45,7 @@ import org.osgi.service.component.Compon
     immediate = true,
     inherit = true,
     configurationFactory = true, 
-    policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
+    policy = ConfigurationPolicy.OPTIONAL,
     specVersion = "1.1", 
     label = "%stanbol.NamedEntityExtractionEnhancementEngine.name", 
     description = "%stanbol.NamedEntityExtractionEnhancementEngine.description")
@@ -58,14 +54,16 @@ import org.osgi.service.component.Compon
     @Property(name=EnhancementEngine.PROPERTY_NAME,value="ner"),
     @Property(name=NamedEntityExtractionEnhancementEngine.PROCESSED_LANGUAGES,value=""),
     @Property(name=NamedEntityExtractionEnhancementEngine.DEFAULT_LANGUAGE,value=""),
-    @Property(name=Constants.SERVICE_RANKING,intValue=0)
+    //set the ranking of the default config to a negative value (ConfigurationPolicy.OPTIONAL) 
+    @Property(name=Constants.SERVICE_RANKING,intValue=-100) 
 })
+@Reference(name="openNLP",referenceInterface=OpenNLP.class, 
+    cardinality=ReferenceCardinality.MANDATORY_UNARY,
+    policy=ReferencePolicy.STATIC)
 public class NamedEntityExtractionEnhancementEngine 
-        extends AbstractEnhancementEngine<IOException,RuntimeException> 
+        extends NEREngineCore
         implements EnhancementEngine, ServiceProperties {
 
-    private EnhancementEngine engineCore;
-    
     public static final String DEFAULT_DATA_OPEN_NLP_MODEL_LOCATION = "org/apache/stanbol/defaultdata/opennlp";
 
     /**
@@ -89,61 +87,60 @@ public class NamedEntityExtractionEnhanc
      * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
      */
     public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
-
-    private ServiceRegistration dfpServiceRegistration;
-    
-    @Reference
-    private OpenNLP openNLP;
+    /**
+     * Bind method of {@link NEREngineCore#openNLP}
+     * @param openNlp
+     */
+    protected void bindOpenNLP(OpenNLP openNlp){
+        this.openNLP = openNlp;
+    }
+    /**
+     * Unbind method of {@link NEREngineCore#openNLP}
+     * @param openNLP
+     */
+    protected void unbindOpenNLP(OpenNLP openNLP){
+        this.openNLP = null;
+    }
     
     protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
         super.activate(ctx);
+        config = new NEREngineConfig();
         // Need to register the default data before loading the models
         Object value = ctx.getProperties().get(DEFAULT_LANGUAGE);
-        final String defaultLanguage;
         if(value != null && !value.toString().isEmpty()){
-            defaultLanguage = value.toString();
-        } else {
-            defaultLanguage = null;
-        }
+            config.setDefaultLanguage(value.toString());
+        } //else no default language
+        
         value = ctx.getProperties().get(PROCESSED_LANGUAGES);
-        final Set<String> processedLanguages;
         if(value instanceof String[]){
-            processedLanguages = new HashSet<String>(Arrays.asList((String[]) value));
-            processedLanguages.remove(null); //remove null
-            processedLanguages.remove(""); //remove empty
+            config.getProcessedLanguages().addAll(Arrays.asList((String[]) value));
+            config.getProcessedLanguages().remove(null); //remove null
+            config.getProcessedLanguages().remove(""); //remove empty
         } else if (value instanceof Collection<?>){
-            processedLanguages = new HashSet<String>();
             for(Object o : ((Collection<?>)value)){
                 if(o != null){
-                    processedLanguages.add(o.toString());
+                    config.getProcessedLanguages().add(o.toString());
                 }
             }
-            processedLanguages.remove(""); //remove empty
+            config.getProcessedLanguages().remove(""); //remove empty
         } else if(value != null && !value.toString().isEmpty()){
             //if a single String is parsed we support ',' as separator
             String[] languageArray = value.toString().split(",");
-            processedLanguages = new HashSet<String>(Arrays.asList(languageArray));
-            processedLanguages.remove(null); //remove null
-            processedLanguages.remove(""); //remove empty
-        } else { //no configuration
-            processedLanguages = Collections.emptySet();
-        }
-        if(!processedLanguages.isEmpty() && defaultLanguage != null &&
-                !processedLanguages.contains(defaultLanguage)){
+            config.getProcessedLanguages().addAll(Arrays.asList(languageArray));
+            config.getProcessedLanguages().remove(null); //remove null
+            config.getProcessedLanguages().remove(""); //remove empty
+        } //else no configuration
+        if(!config.getProcessedLanguages().isEmpty() && config.getDefaultLanguage() != null &&
+                !config.getProcessedLanguages().contains(config.getDefaultLanguage())){
             throw new ConfigurationException(PROCESSED_LANGUAGES, "The list of" +
-            		"processed Languages "+processedLanguages+" MUST CONTAIN the" +
-            		"configured default language '"+defaultLanguage+"'!");
+            		"processed Languages "+config.getProcessedLanguages()+" MUST CONTAIN the" +
+            		"configured default language '"+config.getDefaultLanguage()+"'!");
         }
-        engineCore = new NEREngineCore(openNLP, defaultLanguage, processedLanguages);
     }
 
     protected void deactivate(ComponentContext ctx) {
+        config = null;
         super.deactivate(ctx);
-        if(dfpServiceRegistration != null) {
-            dfpServiceRegistration.unregister();
-            dfpServiceRegistration = null;
-        }
-        engineCore = null;
     }
     
     @Override
@@ -152,22 +149,22 @@ public class NamedEntityExtractionEnhanc
             (Object) defaultOrder));
     }
 
-    @Override
-    public int canEnhance(ContentItem ci) throws EngineException {
-        checkCore();
-        return engineCore.canEnhance(ci);
-    }
-
-    @Override
-    public void computeEnhancements(ContentItem ci) throws EngineException {
-        checkCore();
-        engineCore.computeEnhancements(ci);
-    }
+//    @Override
+//    public int canEnhance(ContentItem ci) throws EngineException {
+//        checkCore();
+//        return engineCore.canEnhance(ci);
+//    }
+
+//    @Override
+//    public void computeEnhancements(ContentItem ci) throws EngineException {
+//        checkCore();
+//        engineCore.computeEnhancements(ci);
+//    }
     
-    private void checkCore() {
-        if(engineCore == null) {
-            throw new IllegalStateException("EngineCore not initialized");
-        }
-    }
+//    private void checkCore() {
+//        if(engineCore == null) {
+//            throw new IllegalStateException("EngineCore not initialized");
+//        }
+//    }
 
 }
\ No newline at end of file

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Nov  5 10:22:01 2012
@@ -43,4 +43,16 @@ An empty text indicates that all languag
 (e.g. 'en,de' to enhance only English and German texts). \
 NOTE: This property can be used to configure multiple instances of this engine that \
 process only documents with specific languages. This might e.g. be useful to \
-enable/disable NER for specific languages.
\ No newline at end of file
+enable/disable NER for specific languages.
+
+stanbol.CustomNERModelEnhancementEngine.name=Apache Stanbol Enhancer Engine: Custom NER Model
+stanbol.CustomNERModelEnhancementEngine.description=NER Engine that allows configuring custom \
+OpenNLP NameFinder models for arbitrary Named Entity types
+stanbol.engines.opennlp-ner.typeMappings.name=Type Mappings
+stanbol.engines.opennlp-ner.typeMappings.description="{named-entity-type} > {uri}" mappings \
+for the Named Entity Types recognized by any of the configured NER models to the URIs used \
+as values for the dc:type property for the generated fise:TextAnnotations. NOTE: \
+TextAnnotations for unmapped Named Entity Types will have no dc:type information.
+stanbol.engines.opennlp-ner.nameFinderModels.name=Name Finder Models
+stanbol.engines.opennlp-ner.nameFinderModels.description=The list of OpenNLP \
+TokenNameFinderModels used for NER
\ No newline at end of file

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Mon Nov  5 10:22:01 2012
@@ -16,6 +16,7 @@
  */
 package org.apache.stanbol.enhancer.engines.opennlp.impl;
 
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTextAnnotations;
 
 import java.io.IOException;
@@ -29,6 +30,10 @@ import org.apache.clerezza.rdf.core.Lite
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
 import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
 import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
@@ -36,6 +41,7 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.junit.Assert;
+import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -54,21 +60,38 @@ public class TestNamedEntityExtractionEn
             + " without any name.\n"
             + "A new paragraph is being written. This paragraph has two sentences.";
 
+    
+    public static final String EHEALTH = "Whereas activation of the HIV-1 enhancer following T-cell " 
+            + "stimulation is mediated largely through binding of the transcription factor NF-kappa "
+            + "B to two adjacent kappa B sites in the HIV-1 long terminal repeat, activation of the "
+            + "HIV-2 enhancer in monocytes and T cells is dependent on four cis-acting elements : a "
+            + "single kappa B site, two purine-rich binding sites , PuB1 and PuB2 , and a pets site .";
+    
     private static ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
-    static NEREngineCore nerEngine;
+    private NEREngineCore nerEngine;
     
     public static final String FAKE_BUNDLE_SYMBOLIC_NAME = "FAKE_BUNDLE_SYMBOLIC_NAME";
-
-    @SuppressWarnings("unchecked")
+    public static OpenNLP openNLP;
+    
     @BeforeClass
-    public static void setUpServices() throws IOException {
-        nerEngine = new NEREngineCore(new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME),
-            "en",Collections.EMPTY_SET);
+    public static void initDataFileProvicer(){
+        DataFileProvider dataFileProvider = new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME);
+        openNLP = new OpenNLP(dataFileProvider);
+    }
+    
+    @Before
+    public void setUpServices() throws IOException {
+        nerEngine = new NEREngineCore(openNLP,
+            new NEREngineConfig()){};
     }
 
     public static ContentItem wrapAsContentItem(final String id,
-            final String text) throws IOException {
-    	return ciFactory.createContentItem(new UriRef(id),new StringSource(text));
+            final String text, String language) throws IOException {
+    	ContentItem ci =  ciFactory.createContentItem(new UriRef(id),new StringSource(text));
+    	if(language != null){
+    	    ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl(language)));
+    	}
+    	return ci;
     }
 
     @Test
@@ -124,7 +147,7 @@ public class TestNamedEntityExtractionEn
     @Test
     public void testComputeEnhancements()
             throws EngineException, IOException {
-        ContentItem ci = wrapAsContentItem("my doc id", SINGLE_SENTENCE);
+        ContentItem ci = wrapAsContentItem("urn:test:content-item:single:sentence", SINGLE_SENTENCE,"en");
         nerEngine.computeEnhancements(ci);
         Map<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
         expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
@@ -135,5 +158,26 @@ public class TestNamedEntityExtractionEn
         int textAnnotationCount = validateAllTextAnnotations(g,SINGLE_SENTENCE,expectedValues);
         assertEquals(3, textAnnotationCount);
     }
+    @Test
+    public void testCustomModel() throws EngineException, IOException {
+        ContentItem ci = wrapAsContentItem("urn:test:content-item:single:sentence", EHEALTH,"en");
+        //this test does not use default models
+        nerEngine.config.getDefaultModelTypes().clear(); 
+        //but instead a custom model provided by the test data
+        nerEngine.config.addCustomNameFinderModel("en", "bionlp2004-DNA-en.bin");
+        nerEngine.config.setMappedType("DNA", new UriRef("http://www.bootstrep.eu/ontology/GRO#DNA"));
+        nerEngine.computeEnhancements(ci);
+        Map<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
+        expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
+        expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(nerEngine.getClass().getName()));
+        //adding null as expected for confidence makes it a required property
+        expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
+        //and dc:type values MUST be the URI set as mapped type
+        expectedValues.put(Properties.DC_TYPE, new UriRef("http://www.bootstrep.eu/ontology/GRO#DNA"));
+        MGraph g = ci.getMetadata();
+        int textAnnotationCount = validateAllTextAnnotations(g,EHEALTH,expectedValues);
+        assertEquals(6, textAnnotationCount);
+    }
+    
 
 }
\ No newline at end of file