You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/05 11:22:02 UTC
svn commit: r1405731 - in
/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner: ./
src/license/ src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/
src/main/resources/OSGI-INF/metatype/
src/test/java/org/apache/stanbol/enha...
Author: rwesten
Date: Mon Nov 5 10:22:01 2012
New Revision: 1405731
URL: http://svn.apache.org/viewvc?rev=1405731&view=rev
Log:
STANBOL-792: merged implementation to the stanbol-nlp-processing branch
Added:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/CustomNERModelEnhancementEngine.java
- copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/CustomNERModelEnhancementEngine.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
- copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/
- copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/
- copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/
- copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/
- copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/
- copied from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/LICENSE
- copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/LICENSE
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/README.md
- copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/README.md
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/bionlp2004-DNA-en.bin
- copied unchanged from r1405730, stanbol/trunk/enhancer/engines/opennlp-ner/src/test/resources/org/apache/stanbol/data/opennlp/bionlp2004-DNA-en.bin
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/ (props changed)
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
Propchange: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Mon Nov 5 10:22:01 2012
@@ -0,0 +1,3 @@
+/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/opennlp-ner:1374978-1386535
+/incubator/stanbol/trunk/enhancer/engines/opennlp-ner:1339554,1339557-1339558,1386989-1388016
+/stanbol/trunk/enhancer/engines/opennlp-ner:1388017-1405730
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml Mon Nov 5 10:22:01 2012
@@ -38,12 +38,12 @@
<scm>
<connection>
- scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/
+ scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/opennlp-ner/
</connection>
<developerConnection>
- scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/
+ scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/opennlp-ner/
</developerConnection>
- <url>http://incubator.apache.org/stanbol/</url>
+ <url>http://stanbol.apache.org/</url>
</scm>
@@ -60,6 +60,8 @@
</Private-Package>
<Import-Package>
!net.didion.*,
+ org.apache.stanbol.enhancer.servicesapi; provide:=true,
+ org.apache.stanbol.enhancer.servicesapi.impl; provide:=true,
*
</Import-Package>
</instructions>
@@ -77,7 +79,7 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.commons.opennlp</artifactId>
- <version>0.9.0-incubating</version>
+ <version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.stanbol</groupId>
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/license/THIRD-PARTY.properties Mon Nov 5 10:22:01 2012
@@ -1,18 +1,25 @@
# Generated by org.codehaus.mojo.license.AddThirdPartyMojo
#-------------------------------------------------------------------------------
# Already used licenses in project :
-# - Apache License
-# - Common Development and Distribution License (CDDL) v1.0
-# - Common Public License Version 1.0
+# - Apache Software License
+# - Apache Software License, Version 2.0
+# - BSD License
+# - Common Development And Distribution License (CDDL), Version 1.0
+# - Common Development And Distribution License (CDDL), Version 1.1
+# - Common Public License, Version 1.0
+# - Eclipse Public License, Version 1.0
+# - GNU General Public License (GPL), Version 2 with classpath exception
+# - GNU Lesser General Public License (LGPL)
+# - GNU Lesser General Public License (LGPL), Version 2.1
# - ICU License
# - MIT License
-# - The Apache Software License, Version 2.0
+# - Public Domain License
#-------------------------------------------------------------------------------
# Please fill the missing licenses for dependencies :
#
#
-#Wed Feb 15 19:06:02 CET 2012
-javax.servlet--servlet-api--2.4=Common Development And Distribution License (CDDL), Version 1.0
+#Sun Oct 07 16:31:16 CEST 2012
+javax.servlet--servlet-api--2.5=Common Development And Distribution License (CDDL), Version 1.0
jwnl--jwnl--1.3.3=BSD License
org.osgi--org.osgi.compendium--4.1.0=The Apache Software License, Version 2.0
org.osgi--org.osgi.core--4.1.0=The Apache Software License, Version 2.0
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Mon Nov 5 10:22:01 2012
@@ -65,15 +65,19 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Core of our EnhancementEngine, separated from the OSGi service to make it easier to test this.
+ * Core of the NER EnhancementEngine(s), separated from the OSGi service to make
+ * it easier to test this.
*/
-public class NEREngineCore implements EnhancementEngine {
+public abstract class NEREngineCore
+ extends AbstractEnhancementEngine<IOException,RuntimeException>
+ implements EnhancementEngine {
protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
/**
* Contains the only supported mimetype {@link #TEXT_PLAIN_MIMETYPE}
@@ -82,18 +86,10 @@ public class NEREngineCore implements En
Collections.singleton(TEXT_PLAIN_MIMETYPE);
private final Logger log = LoggerFactory.getLogger(getClass());
- private static Map<String,UriRef> entityTypes = new HashMap<String,UriRef>();
- static {
- entityTypes.put("person", OntologicalClasses.DBPEDIA_PERSON);
- entityTypes.put("location", OntologicalClasses.DBPEDIA_PLACE);
- entityTypes.put("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
- }
- private OpenNLP openNLP;
-
- private final String defaultLang;
-
- private final Set<String> processedLangs;
+ protected OpenNLP openNLP;
+
+ protected NEREngineConfig config;
/** Comments about our models */
public static final Map<String, String> DATA_FILE_COMMENTS;
@@ -101,28 +97,28 @@ public class NEREngineCore implements En
DATA_FILE_COMMENTS = new HashMap<String, String>();
DATA_FILE_COMMENTS.put("Default data files", "provided by the org.apache.stanbol.defaultdata bundle");
}
-
- public NEREngineCore(OpenNLP openNLP, String defaultLanguage, Set<String> processedLanguages) throws InvalidFormatException, IOException{
+ /**
+ * Default constructor for subclasses. If this constructor is used, subclasses
+ * MUST ensure that {@link #openNLP} and {@link #config} are set before calling
+ * {@link #canEnhance(ContentItem)} or {@link #computeEnhancements(ContentItem)}.
+ */
+ protected NEREngineCore(){}
+
+ NEREngineCore(OpenNLP openNLP, NEREngineConfig config) throws InvalidFormatException, IOException{
+ if(openNLP == null){
+ throw new IllegalArgumentException("The parsed OpenNLP instance MUST NOT be NULL!");
+ }
+ if(config == null){
+ throw new IllegalArgumentException("The parsed NER engine configuration MUST NOT be NULL!");
+ }
this.openNLP = openNLP;
- this.defaultLang = defaultLanguage;
- this.processedLangs = Collections.unmodifiableSet(processedLanguages);
+ this.config = config;
}
- NEREngineCore(DataFileProvider dfp,String defaultLanguage, Set<String> processedLanguages) throws InvalidFormatException, IOException {
- this(new OpenNLP(dfp),defaultLanguage,processedLanguages);
+ NEREngineCore(DataFileProvider dfp,NEREngineConfig config) throws InvalidFormatException, IOException {
+ this(new OpenNLP(dfp),config);
}
-// protected TokenNameFinderModel buildNameModel(String name, UriRef typeUri) throws IOException {
-// //String modelRelativePath = String.format("en-ner-%s.bin", name);
-// TokenNameFinderModel model = openNLP.getNameModel(name, "en");
-// // register the name finder instances for matching owl class
-//// entityTypes.put(name, new Object[] {typeUri, model});
-// return model;
-// }
- @Override
- public String getName() {
- return getClass().getName();
- }
public void computeEnhancements(ContentItem ci) throws EngineException {
//first check the langauge before processing the content (text)
@@ -133,10 +129,9 @@ public class NEREngineCore implements En
+ "method! -> This indicated an Bug in the implementation of the "
+ "EnhancementJobManager!");
}
- if(!isProcessedLangage(language)){
- throw new IllegalStateException("The language '"+language+"' of ContentItem "+ci.getUri()
- + " is not configured to be processed by this NER engine instance "
- + "(processed "+processedLangs+"): This is also checked in the canEnhance "
+ if(!isNerModel(language)){
+ throw new IllegalStateException("For the language '"+language+"' of ContentItem "+ci.getUri()
+ + " no NER model is configured: This is also checked in the canEnhance "
+ "method! -> This indicated an Bug in the implementation of the "
+ "EnhancementJobManager!");
}
@@ -167,14 +162,29 @@ public class NEREngineCore implements En
new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(),
StringUtils.abbreviate(text, 100)});
try {
- for (Map.Entry<String,UriRef> type : entityTypes.entrySet()) {
- String typeLabel = type.getKey();
- UriRef typeUri = type.getValue();
- TokenNameFinderModel nameFinderModel = openNLP.getNameModel(typeLabel, language);
- if(nameFinderModel == null){
- log.info("No NER Model for {} and language {} available!",typeLabel,language);
- } else {
- findNamedEntities(ci, text, language, typeUri, typeLabel, nameFinderModel);
+ if(config.isProcessedLangage(language)){
+ for (String defaultModelType : config.getDefaultModelTypes()) {
+ TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
+ if(nameFinderModel == null){
+ log.info("No NER Model for {} and language {} available!",defaultModelType,language);
+ } else {
+ findNamedEntities(ci, text, language, nameFinderModel);
+ }
+ }
+ } //else do not use default models for languages other than the processed one
+ //process for additional models
+ for(String additionalModel : config.getSpecificNerModles(language)){
+ TokenNameFinderModel nameFinderModel;
+ try {
+ nameFinderModel = openNLP.getModel(TokenNameFinderModel.class,
+ additionalModel, null);
+ findNamedEntities(ci, text, language, nameFinderModel);
+ } catch (IOException e) {
+ log.warn("Unable to load TokenNameFinderModel model for language '"+language
+ + "' (model: "+additionalModel+")",e);
+ } catch (RuntimeException e){
+ log.warn("Error while creating ChunkerModel for language '"+language
+ + "' (model: "+additionalModel+")",e);
}
}
} catch (Exception e) {
@@ -189,8 +199,6 @@ public class NEREngineCore implements En
protected void findNamedEntities(final ContentItem ci,
final String text,
final String lang,
- final UriRef typeUri,
- final String typeLabel,
final TokenNameFinderModel nameFinderModel) {
if (ci == null) {
@@ -206,8 +214,10 @@ public class NEREngineCore implements En
} else {
language = null;
}
- log.debug("findNamedEntities typeUri={}, type={}, text=",
- new Object[]{ typeUri, typeLabel, StringUtils.abbreviate(text, 100) });
+ if(log.isDebugEnabled()){
+ log.debug("findNamedEntities model={}, language={}, text=",
+ new Object[]{ nameFinderModel, language, StringUtils.abbreviate(text, 100) });
+ }
LiteralFactory literalFactory = LiteralFactory.getInstance();
MGraph g = ci.getMetadata();
Map<String,List<NameOccurrence>> entityNames = extractNameOccurrences(nameFinderModel, text);
@@ -228,7 +238,9 @@ public class NEREngineCore implements En
new PlainLiteralImpl(name, language)));
g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
new PlainLiteralImpl(occurrence.context, language)));
- g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri));
+ if(occurrence.type != null){
+ g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
+ }
g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory
.createTypedLiteral(occurrence.confidence)));
if (occurrence.start != null && occurrence.end != null) {
@@ -388,10 +400,10 @@ public class NEREngineCore implements En
String[] tokens = Span.spansToStrings(tokenSpans, sentence);
Span[] nameSpans = finder.find(tokens);
double[] probs = finder.probs();
- String[] names = Span.spansToStrings(nameSpans, tokens);
//int lastStartPosition = 0;
- for (int j = 0; j < names.length; j++) {
- String name = names[j];
+ for (int j = 0; j < nameSpans.length; j++) {
+ String name = sentence.substring(tokenSpans[nameSpans[j].getStart()].getStart(),
+ tokenSpans[nameSpans[j].getEnd()-1].getEnd());
Double confidence = 1.0;
for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
confidence *= probs[k];
@@ -399,8 +411,9 @@ public class NEREngineCore implements En
int start = tokenSpans[nameSpans[j].getStart()].getStart();
int absoluteStart = sentenceSpans[i].getStart() + start;
int absoluteEnd = absoluteStart + name.length();
- NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, context,
- confidence);
+ UriRef mappedType = config.getMappedType(nameSpans[j].getType());
+ NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd,
+ mappedType, context, confidence);
List<NameOccurrence> occurrences = nameOccurrences.get(name);
if (occurrences == null) {
@@ -416,11 +429,12 @@ public class NEREngineCore implements En
}
public int canEnhance(ContentItem ci) {
- if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null
- && isProcessedLangage(extractLanguage(ci))){
- return ENHANCE_ASYNC; //The NER engine now supports Async processing!
+ if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null &&
+ isNerModel(extractLanguage(ci))){
+ return ENHANCE_ASYNC;
+ } else {
+ return CANNOT_ENHANCE;
}
- return CANNOT_ENHANCE;
}
/**
@@ -445,38 +459,6 @@ public class NEREngineCore implements En
}
/**
- * The default language
- * @return the defaultLang
- */
- public String getDefaultLanguage() {
- return defaultLang;
- }
- /**
- * Checks if the parsed language is enabled for processing.
- * If <code>null</code> is parsed as language this returns <code>false</code>
- * even if processing of all languages is enabled. <p>
- * NOTE: If this Method returns <code>true</code> this does
- * not mean that text with this language can be actually processed because this
- * also requires that the NER model for this language are available via the
- * parsed {@link OpenNLP} instance.
- * @param lang the language
- * @return the state
- */
- public boolean isProcessedLangage(String lang){
- return lang != null && (processedLangs.isEmpty() || processedLangs.contains(lang));
- }
- /*
- * The following Utility extracts the language from the metadata of the
- * parsed Content Item.
- * This Utility is actually a copy of the same form the KeywordExtractionEngine.
- * TODO: change this to a global Utility as soon as STANBOL Enhancement
- * Structure is defined
- */
- /**
- * The literal representing the LangIDEngine as creator.
- */
- public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.langid.LangIdEnhancementEngine");
- /**
* Extracts the language of the parsed ContentItem by using
* {@link EnhancementEngineHelper#getLanguage(ContentItem)} and
* {@link #defaultLang} as default
@@ -485,26 +467,26 @@ public class NEREngineCore implements En
*/
private String extractLanguage(ContentItem ci) {
String lang = EnhancementEngineHelper.getLanguage(ci);
-// MGraph metadata = ci.getMetadata();
-// Iterator<Triple> langaugeEnhancementCreatorTriples =
-// metadata.filter(null, Properties.DC_CREATOR, LANG_ID_ENGINE_NAME);
-// if(langaugeEnhancementCreatorTriples.hasNext()){
-// String lang = EnhancementEngineHelper.getString(metadata,
-// langaugeEnhancementCreatorTriples.next().getSubject(),
-// Properties.DC_LANGUAGE);
if(lang != null){
return lang;
} else {
- log.info("Unable to extract language for ContentItem %s! The Enhancement of the %s is missing the %s property",
- new Object[]{ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm(),Properties.DC_LANGUAGE});
- log.info(" ... return '{}' as default",defaultLang);
- return defaultLang;
- }
-// } else {
-// log.info("Unable to extract language for ContentItem {}! Is the {} active?",
-// ci.getUri().getUnicodeString(),LANG_ID_ENGINE_NAME.getLexicalForm());
-// log.info(" ... return '{}' as default",defaultLang);
-// return defaultLang;
-// }
+ log.info("Unable to extract language for ContentItem %s!",ci.getUri().getUnicodeString());
+ log.info(" ... return '{}' as default",config.getDefaultLanguage());
+ return config.getDefaultLanguage();
+ }
+ }
+ /**
+ * Checks if the {@link #config} has a NER model for the given language.
+ * This is the case if the given language
+ * {@link NEREngineConfig#isProcessedLangage(String) is processed} and at least one
+ * {@link NEREngineConfig#getDefaultModelTypes() default model type} is present, OR
+ * if any {@link NEREngineConfig#getSpecificNerModles(String) specific NER model}
+ * is configured for the given language.
+ * @param lang The language to check
+ * @return <code>true</code> if any NER model is configured for the given language
+ */
+ public boolean isNerModel(String lang){
+ return (config.isProcessedLangage(lang) && !config.getDefaultModelTypes().isEmpty()) ||
+ !config.getSpecificNerModles(lang).isEmpty();
+
}
}
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NameOccurrence.java Mon Nov 5 10:22:01 2012
@@ -16,6 +16,8 @@
*/
package org.apache.stanbol.enhancer.engines.opennlp.impl;
+import org.apache.clerezza.rdf.core.UriRef;
+
public class NameOccurrence {
public final String name;
@@ -28,11 +30,14 @@ public class NameOccurrence {
public final Double confidence;
- public NameOccurrence(String name, Integer start, Integer end,
+ public final UriRef type;
+
+ public NameOccurrence(String name, Integer start, Integer end, UriRef type,
String context, Double confidence) {
this.start = start;
this.end = end;
this.name = name;
+ this.type = type;
this.context = context;
this.confidence = confidence;
}
@@ -40,8 +45,8 @@ public class NameOccurrence {
@Override
public String toString() {
return String.format(
- "[name='%s', start='%d', end='%d', confidence='%f', context='%s']",
- name, start, end, confidence, context);
+ "[name='%s', start='%d', end='%d', type='%s', confidence='%f', context='%s']",
+ name, start, end, type, confidence, context);
}
}
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java Mon Nov 5 10:22:01 2012
@@ -20,23 +20,19 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
-import java.util.HashSet;
import java.util.Map;
-import java.util.Set;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.ReferenceCardinality;
+import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.opennlp.OpenNLP;
-import org.apache.stanbol.enhancer.servicesapi.ContentItem;
-import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
-import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
-import org.osgi.framework.ServiceRegistration;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
@@ -49,7 +45,7 @@ import org.osgi.service.component.Compon
immediate = true,
inherit = true,
configurationFactory = true,
- policy = ConfigurationPolicy.REQUIRE, // the baseUri is required!
+ policy = ConfigurationPolicy.OPTIONAL,
specVersion = "1.1",
label = "%stanbol.NamedEntityExtractionEnhancementEngine.name",
description = "%stanbol.NamedEntityExtractionEnhancementEngine.description")
@@ -58,14 +54,16 @@ import org.osgi.service.component.Compon
@Property(name=EnhancementEngine.PROPERTY_NAME,value="ner"),
@Property(name=NamedEntityExtractionEnhancementEngine.PROCESSED_LANGUAGES,value=""),
@Property(name=NamedEntityExtractionEnhancementEngine.DEFAULT_LANGUAGE,value=""),
- @Property(name=Constants.SERVICE_RANKING,intValue=0)
+ //set the ranking of the default config to a negative value (ConfigurationPolicy.OPTIONAL)
+ @Property(name=Constants.SERVICE_RANKING,intValue=-100)
})
+@Reference(name="openNLP",referenceInterface=OpenNLP.class,
+ cardinality=ReferenceCardinality.MANDATORY_UNARY,
+ policy=ReferencePolicy.STATIC)
public class NamedEntityExtractionEnhancementEngine
- extends AbstractEnhancementEngine<IOException,RuntimeException>
+ extends NEREngineCore
implements EnhancementEngine, ServiceProperties {
- private EnhancementEngine engineCore;
-
public static final String DEFAULT_DATA_OPEN_NLP_MODEL_LOCATION = "org/apache/stanbol/defaultdata/opennlp";
/**
@@ -89,61 +87,60 @@ public class NamedEntityExtractionEnhanc
* {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
*/
public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
-
- private ServiceRegistration dfpServiceRegistration;
-
- @Reference
- private OpenNLP openNLP;
+ /**
+ * Bind method of {@link NEREngineCore#openNLP}
+ * @param openNlp
+ */
+ protected void bindOpenNLP(OpenNLP openNlp){
+ this.openNLP = openNlp;
+ }
+ /**
+ * Unbind method of {@link NEREngineCore#openNLP}
+ * @param openNLP
+ */
+ protected void unbindOpenNLP(OpenNLP openNLP){
+ this.openNLP = null;
+ }
protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
super.activate(ctx);
+ config = new NEREngineConfig();
// Need to register the default data before loading the models
Object value = ctx.getProperties().get(DEFAULT_LANGUAGE);
- final String defaultLanguage;
if(value != null && !value.toString().isEmpty()){
- defaultLanguage = value.toString();
- } else {
- defaultLanguage = null;
- }
+ config.setDefaultLanguage(value.toString());
+ } //else no default language
+
value = ctx.getProperties().get(PROCESSED_LANGUAGES);
- final Set<String> processedLanguages;
if(value instanceof String[]){
- processedLanguages = new HashSet<String>(Arrays.asList((String[]) value));
- processedLanguages.remove(null); //remove null
- processedLanguages.remove(""); //remove empty
+ config.getProcessedLanguages().addAll(Arrays.asList((String[]) value));
+ config.getProcessedLanguages().remove(null); //remove null
+ config.getProcessedLanguages().remove(""); //remove empty
} else if (value instanceof Collection<?>){
- processedLanguages = new HashSet<String>();
for(Object o : ((Collection<?>)value)){
if(o != null){
- processedLanguages.add(o.toString());
+ config.getProcessedLanguages().add(o.toString());
}
}
- processedLanguages.remove(""); //remove empty
+ config.getProcessedLanguages().remove(""); //remove empty
} else if(value != null && !value.toString().isEmpty()){
//if a single String is parsed we support ',' as seperator
String[] languageArray = value.toString().split(",");
- processedLanguages = new HashSet<String>(Arrays.asList(languageArray));
- processedLanguages.remove(null); //remove null
- processedLanguages.remove(""); //remove empty
- } else { //no configuration
- processedLanguages = Collections.emptySet();
- }
- if(!processedLanguages.isEmpty() && defaultLanguage != null &&
- !processedLanguages.contains(defaultLanguage)){
+ config.getProcessedLanguages().addAll(Arrays.asList(languageArray));
+ config.getProcessedLanguages().remove(null); //remove null
+ config.getProcessedLanguages().remove(""); //remove empty
+ } //else no configuration
+ if(!config.getProcessedLanguages().isEmpty() && config.getDefaultLanguage() != null &&
+ !config.getProcessedLanguages().contains(config.getDefaultLanguage())){
throw new ConfigurationException(PROCESSED_LANGUAGES, "The list of" +
- "processed Languages "+processedLanguages+" MUST CONTAIN the" +
- "configured default language '"+defaultLanguage+"'!");
+ "processed Languages "+config.getProcessedLanguages()+" MUST CONTAIN the" +
+ "configured default language '"+config.getDefaultLanguage()+"'!");
}
- engineCore = new NEREngineCore(openNLP, defaultLanguage, processedLanguages);
}
protected void deactivate(ComponentContext ctx) {
+ config = null;
super.deactivate(ctx);
- if(dfpServiceRegistration != null) {
- dfpServiceRegistration.unregister();
- dfpServiceRegistration = null;
- }
- engineCore = null;
}
@Override
@@ -152,22 +149,22 @@ public class NamedEntityExtractionEnhanc
(Object) defaultOrder));
}
- @Override
- public int canEnhance(ContentItem ci) throws EngineException {
- checkCore();
- return engineCore.canEnhance(ci);
- }
-
- @Override
- public void computeEnhancements(ContentItem ci) throws EngineException {
- checkCore();
- engineCore.computeEnhancements(ci);
- }
+// @Override
+// public int canEnhance(ContentItem ci) throws EngineException {
+// checkCore();
+// return engineCore.canEnhance(ci);
+// }
+
+// @Override
+// public void computeEnhancements(ContentItem ci) throws EngineException {
+// checkCore();
+// engineCore.computeEnhancements(ci);
+// }
- private void checkCore() {
- if(engineCore == null) {
- throw new IllegalStateException("EngineCore not initialized");
- }
- }
+// private void checkCore() {
+// if(engineCore == null) {
+// throw new IllegalStateException("EngineCore not initialized");
+// }
+// }
}
\ No newline at end of file
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Nov 5 10:22:01 2012
@@ -43,4 +43,16 @@ An empty text indicates that all languag
(e.g. 'en,de' to enhance only English and German texts). \
NOTE: This property can be used to configure multiple instances of this engine that \
process only documents with specific languages. This might e.g. be useful to \
-enable/disable NER for specific languages.
\ No newline at end of file
+enable/disable NER for specific languages.
+
+stanbol.CustomNERModelEnhancementEngine.name=Apache Stanbol Enhancer Engine: Custom NER Model
+stanbol.CustomNERModelEnhancementEngine.description=NER Engine that allows configuring custom \
+OpenNLP NameFinder models for arbitrary Named Entity types
+stanbol.engines.opennlp-ner.typeMappings.name=Type Mappings
+stanbol.engines.opennlp-ner.typeMappings.description="{named-entity-type} > {uri}" mappings \
+for the Named Entity Types recognized by any of the configured NER models to the URIs used \
+as values for the dc:type property for the generated fise:TextAnnotations. NOTE that \
+TextAnnotations for unmapped Named Entity Types will have no dc:type information.
+stanbol.engines.opennlp-ner.nameFinderModels.name=Name Finder Models
+stanbol.engines.opennlp-ner.nameFinderModels.description=The list of NER - OpenNLP \
+TokenNameFinderModel's
\ No newline at end of file
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1405731&r1=1405730&r2=1405731&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Mon Nov 5 10:22:01 2012
@@ -16,6 +16,7 @@
*/
package org.apache.stanbol.enhancer.engines.opennlp.impl;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTextAnnotations;
import java.io.IOException;
@@ -29,6 +30,10 @@ import org.apache.clerezza.rdf.core.Lite
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
@@ -36,6 +41,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.junit.Assert;
+import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -54,21 +60,38 @@ public class TestNamedEntityExtractionEn
+ " without any name.\n"
+ "A new paragraph is being written. This paragraph has two sentences.";
+
+ public static final String EHEALTH = "Whereas activation of the HIV-1 enhancer following T-cell "
+ + "stimulation is mediated largely through binding of the transcription factor NF-kappa "
+ + "B to two adjacent kappa B sites in the HIV-1 long terminal repeat, activation of the "
+ + "HIV-2 enhancer in monocytes and T cells is dependent on four cis-acting elements : a "
+ + "single kappa B site, two purine-rich binding sites , PuB1 and PuB2 , and a pets site .";
+
private static ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
- static NEREngineCore nerEngine;
+ private NEREngineCore nerEngine;
public static final String FAKE_BUNDLE_SYMBOLIC_NAME = "FAKE_BUNDLE_SYMBOLIC_NAME";
-
- @SuppressWarnings("unchecked")
+ public static OpenNLP openNLP;
+
@BeforeClass
- public static void setUpServices() throws IOException {
- nerEngine = new NEREngineCore(new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME),
- "en",Collections.EMPTY_SET);
+ public static void initDataFileProvicer(){
+ DataFileProvider dataFileProvider = new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME);
+ openNLP = new OpenNLP(dataFileProvider);
+ }
+
+ @Before
+ public void setUpServices() throws IOException {
+ nerEngine = new NEREngineCore(openNLP,
+ new NEREngineConfig()){};
}
public static ContentItem wrapAsContentItem(final String id,
- final String text) throws IOException {
- return ciFactory.createContentItem(new UriRef(id),new StringSource(text));
+ final String text, String language) throws IOException {
+ ContentItem ci = ciFactory.createContentItem(new UriRef(id),new StringSource(text));
+ if(language != null){
+ ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl(language)));
+ }
+ return ci;
}
@Test
@@ -124,7 +147,7 @@ public class TestNamedEntityExtractionEn
@Test
public void testComputeEnhancements()
throws EngineException, IOException {
- ContentItem ci = wrapAsContentItem("my doc id", SINGLE_SENTENCE);
+ ContentItem ci = wrapAsContentItem("urn:test:content-item:single:sentence", SINGLE_SENTENCE,"en");
nerEngine.computeEnhancements(ci);
Map<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
@@ -135,5 +158,26 @@ public class TestNamedEntityExtractionEn
int textAnnotationCount = validateAllTextAnnotations(g,SINGLE_SENTENCE,expectedValues);
assertEquals(3, textAnnotationCount);
}
+ @Test
+ public void testCustomModel() throws EngineException, IOException {
+ ContentItem ci = wrapAsContentItem("urn:test:content-item:single:sentence", EHEALTH,"en");
+ //this test does not use default models
+ nerEngine.config.getDefaultModelTypes().clear();
+ //but instead a custom model provided by the test data
+ nerEngine.config.addCustomNameFinderModel("en", "bionlp2004-DNA-en.bin");
+ nerEngine.config.setMappedType("DNA", new UriRef("http://www.bootstrep.eu/ontology/GRO#DNA"));
+ nerEngine.computeEnhancements(ci);
+ Map<UriRef,Resource> expectedValues = new HashMap<UriRef,Resource>();
+ expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
+ expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(nerEngine.getClass().getName()));
+ //adding null as expected for confidence makes it a required property
+ expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
+ //and dc:type values MUST be the URI set as mapped type
+ expectedValues.put(Properties.DC_TYPE, new UriRef("http://www.bootstrep.eu/ontology/GRO#DNA"));
+ MGraph g = ci.getMetadata();
+ int textAnnotationCount = validateAllTextAnnotations(g,EHEALTH,expectedValues);
+ assertEquals(6, textAnnotationCount);
+ }
+
}
\ No newline at end of file