You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by bd...@apache.org on 2011/04/06 15:40:06 UTC
svn commit: r1089451 - in /incubator/stanbol/trunk/enhancer:
engines/opennlp-ner/
engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/
engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/
in...
Author: bdelacretaz
Date: Wed Apr 6 13:40:05 2011
New Revision: 1089451
URL: http://svn.apache.org/viewvc?rev=1089451&view=rev
Log:
STANBOL-146 - opennlp engine uses DataFileProvider
Added:
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ClasspathDataFileProvider.java (with props)
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/EngineCore.java (with props)
Removed:
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/MockComponentContext.java
Modified:
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
incubator/stanbol/trunk/enhancer/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/StatelessEngineTest.java
incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml
incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml
Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml?rev=1089451&r1=1089450&r2=1089451&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml Wed Apr 6 13:40:05 2011
@@ -69,6 +69,12 @@
</dependency>
<dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
+ <version>0.9-SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
<groupId>org.apache.clerezza</groupId>
<artifactId>org.apache.clerezza.rdf.core</artifactId>
</dependency>
Added: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ClasspathDataFileProvider.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ClasspathDataFileProvider.java?rev=1089451&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ClasspathDataFileProvider.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ClasspathDataFileProvider.java Wed Apr 6 13:40:05 2011
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.opennlp.impl;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+
+/** DataFileProvider that looks in our class resources */
+public class ClasspathDataFileProvider implements DataFileProvider {
+
+ @Override
+ public InputStream getInputStream(String bundleSymbolicName,
+ String filename, String downloadExplanation)
+ throws IOException {
+ // load default OpenNLP models from classpath (embedded in the defaultdata bundle)
+ final String resourcePath = "org/apache/stanbol/defaultdata/opennlp/" + filename;
+ final InputStream in = getClass().getClassLoader().getResourceAsStream(resourcePath);
+ if (in == null) {
+ throw new IOException("Resource not found in my classpath: " + resourcePath);
+ }
+ return in;
+ }
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ClasspathDataFileProvider.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ClasspathDataFileProvider.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision Rev URL
Added: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/EngineCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/EngineCore.java?rev=1089451&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/EngineCore.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/EngineCore.java Wed Apr 6 13:40:05 2011
@@ -0,0 +1,315 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.opennlp.impl;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.Span;
+
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+
+/** Core of our EnhancementEngine, separated from the OSGi service
+ * to make it easier to test this.
+ */
+public class EngineCore implements EnhancementEngine {
+ protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+
+ public static final Log log = LogFactory.getLog(NamedEntityExtractionEnhancementEngine.class);
+ private final DataFileProvider dataFileProvider;
+ private final String bundleSymbolicName;
+ protected final SentenceModel sentenceModel;
+ protected final TokenNameFinderModel personNameModel;
+ protected final TokenNameFinderModel locationNameModel;
+ protected final TokenNameFinderModel organizationNameModel;
+ protected Map<String,Object[]> entityTypes = new HashMap<String,Object[]>();
+
+ /** Explain where to get more models */
+ public static final String MODEL_DOWNLOAD_EXPLANATION = "Ask on the Apache Stanbol mailing list";
+
+ EngineCore(DataFileProvider dfp, String bundleSymbolicName) throws InvalidFormatException, IOException {
+ dataFileProvider = dfp;
+ this.bundleSymbolicName = bundleSymbolicName;
+ sentenceModel = new SentenceModel(lookupModelStream("en-sent.bin"));
+ personNameModel = buildNameModel("person", OntologicalClasses.DBPEDIA_PERSON);
+ locationNameModel = buildNameModel("location", OntologicalClasses.DBPEDIA_PLACE);
+ organizationNameModel = buildNameModel("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+ }
+
+ protected InputStream lookupModelStream(String modelRelativePath) throws IOException {
+ return dataFileProvider.getInputStream(bundleSymbolicName, modelRelativePath, MODEL_DOWNLOAD_EXPLANATION);
+ }
+
+ protected TokenNameFinderModel buildNameModel(String name, UriRef typeUri) throws IOException {
+ String modelRelativePath = String.format("en-ner-%s.bin", name);
+ TokenNameFinderModel model = new TokenNameFinderModel(lookupModelStream(modelRelativePath));
+ // register the name finder instances for the matching OWL class
+ entityTypes.put(name, new Object[] {typeUri, model});
+ return model;
+ }
+
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ String text;
+ try {
+ text = IOUtils.toString(ci.getStream(), "UTF-8");
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
+ }
+ if (text.trim().length() == 0) {
+ // TODO: make the length of the data a field of the ContentItem
+ // interface to be able to filter out empty items in the canEnhance
+ // method
+ log.warn("nothing to extract knowledge from");
+ return;
+ }
+
+ try {
+ for (Map.Entry<String,Object[]> type : entityTypes.entrySet()) {
+ String typeLabel = type.getKey();
+ Object[] typeInfo = type.getValue();
+ UriRef typeUri = (UriRef) typeInfo[0];
+ TokenNameFinderModel nameFinderModel = (TokenNameFinderModel) typeInfo[1];
+ findNamedEntities(ci, text, typeUri, typeLabel, nameFinderModel);
+ }
+ } catch (Exception e) { // TODO: does it make sense to catch Exception here?
+ throw new EngineException(this, ci, e);
+ }
+ }
+
+ protected void findNamedEntities(final ContentItem ci,
+ final String text,
+ final UriRef typeUri,
+ final String typeLabel,
+ final TokenNameFinderModel nameFinderModel) {
+
+ if (ci == null) {
+ throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
+ }
+ if (text == null) {
+ log.warn("NULL was parsed as text for content item " + ci.getId() + "! -> call ignored");
+ return;
+ }
+ LiteralFactory literalFactory = LiteralFactory.getInstance();
+ MGraph g = ci.getMetadata();
+ Map<String,List<NameOccurrence>> entityNames = extractNameOccurrences(nameFinderModel, text);
+
+ Map<String,UriRef> previousAnnotations = new LinkedHashMap<String,UriRef>();
+ for (Map.Entry<String,List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
+
+ String name = nameInContext.getKey();
+ List<NameOccurrence> occurrences = nameInContext.getValue();
+
+ UriRef firstOccurrenceAnnotation = null;
+
+ for (NameOccurrence occurrence : occurrences) {
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, literalFactory
+ .createTypedLiteral(name)));
+ g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, literalFactory
+ .createTypedLiteral(occurrence.context)));
+ g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri));
+ g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory
+ .createTypedLiteral(occurrence.confidence)));
+ if (occurrence.start != null && occurrence.end != null) {
+ g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory
+ .createTypedLiteral(occurrence.start)));
+ g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory
+ .createTypedLiteral(occurrence.end)));
+ }
+
+ // add the subsumption relationship among occurrences of the same
+ // name
+ if (firstOccurrenceAnnotation == null) {
+ // check already extracted annotations to find a first most
+ // specific occurrence
+ for (Map.Entry<String,UriRef> entry : previousAnnotations.entrySet()) {
+ if (entry.getKey().contains(name)) {
+ // we have found a most specific previous
+ // occurrence, use it as subsumption target
+ firstOccurrenceAnnotation = entry.getValue();
+ g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
+ break;
+ }
+ }
+ if (firstOccurrenceAnnotation == null) {
+ // no most specific previous occurrence, I am the first,
+ // most specific occurrence to be later used as a target
+ firstOccurrenceAnnotation = textAnnotation;
+ previousAnnotations.put(name, textAnnotation);
+ }
+ } else {
+ // I am referring to a most specific first occurrence of the
+ // same name
+ g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
+ }
+ }
+ }
+ }
+
+ public Collection<String> extractPersonNames(String text) {
+ return extractNames(personNameModel, text);
+ }
+
+ public Collection<String> extractLocationNames(String text) {
+ return extractNames(locationNameModel, text);
+ }
+
+ public Collection<String> extractOrganizationNames(String text) {
+ return extractNames(organizationNameModel, text);
+ }
+
+ public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text) {
+ return extractNameOccurrences(personNameModel, text);
+ }
+
+ public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text) {
+ return extractNameOccurrences(locationNameModel, text);
+ }
+
+ public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text) {
+ return extractNameOccurrences(organizationNameModel, text);
+ }
+
+ protected Collection<String> extractNames(TokenNameFinderModel nameFinderModel, String text) {
+ return extractNameOccurrences(nameFinderModel, text).keySet();
+ }
+
+ protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
+ String text) {
+
+ // version with explicit sentence endings to reflect heading / paragraph
+ // structure of an HTML or PDF document converted to text
+ String textWithDots = text.replaceAll("\\n\\n", ".\n");
+
+ SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel);
+
+ Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
+
+ NameFinderME finder = new NameFinderME(nameFinderModel);
+
+ Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
+ Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+ for (int i = 0; i < sentenceSpans.length; i++) {
+ String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
+
+ // build a context by concatenating three sentences to be used for
+ // similarity ranking / disambiguation + contextual snippet in the
+ // extraction structure
+ List<String> contextElements = new ArrayList<String>();
+ if (i > 0) {
+ CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
+ contextElements.add(previousSentence.toString().trim());
+ }
+ contextElements.add(sentence.toString().trim());
+ if (i + 1 < sentenceSpans.length) {
+ CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
+ contextElements.add(nextSentence.toString().trim());
+ }
+ String context = StringUtils.join(contextElements, " ");
+
+ // extract the names in the current sentence and
+ // store them together with the current context
+ String[] tokens = tokenizer.tokenize(sentence);
+ Span[] nameSpans = finder.find(tokens);
+ double[] probs = finder.probs();
+ String[] names = Span.spansToStrings(nameSpans, tokens);
+ int lastStartPosition = 0;
+ for (int j = 0; j < names.length; j++) {
+ String name = names[j];
+ Double confidence = 1.0;
+ for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
+ confidence *= probs[k];
+ }
+ int start = sentence.substring(lastStartPosition).indexOf(name);
+ Integer absoluteStart = null;
+ Integer absoluteEnd = null;
+ if (start != -1) {
+ /*
+ * NOTE (rw, issue 19, 20100615) Here we need to set the new start position, by adding the
+ * current start to the lastStartPosition. We also need to use the lastStartPosition to
+ * calculate the start of the element. The old code did not work if names contained more
+ * than a single element!
+ */
+ lastStartPosition += start;
+ absoluteStart = sentenceSpans[i].getStart() + lastStartPosition;
+ absoluteEnd = absoluteStart + name.length();
+ }
+ NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, context,
+ confidence);
+
+ List<NameOccurrence> occurrences = nameOccurrences.get(name);
+ if (occurrences == null) {
+ occurrences = new ArrayList<NameOccurrence>();
+ }
+ occurrences.add(occurrence);
+ nameOccurrences.put(name, occurrences);
+ }
+ }
+ finder.clearAdaptiveData();
+
+ if (log.isDebugEnabled()) {
+ for (List<NameOccurrence> occurrences : nameOccurrences.values()) {
+ log.debug("Occurrences found: " + StringUtils.join(occurrences, ", "));
+ }
+ }
+ return nameOccurrences;
+ }
+
+ public int canEnhance(ContentItem ci) {
+ // in case text/plain;charset=UTF-8 is parsed
+ String mimeType = ci.getMimeType().split(";", 2)[0];
+ if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
+ return ENHANCE_SYNCHRONOUS;
+ }
+ return CANNOT_ENHANCE;
+ }
+}
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/EngineCore.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/EngineCore.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision Rev URL
Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java?rev=1089451&r1=1089450&r2=1089451&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java Wed Apr 6 13:40:05 2011
@@ -16,54 +16,19 @@
*/
package org.apache.stanbol.enhancer.engines.opennlp.impl;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
-
-import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collection;
import java.util.Collections;
-import java.util.Dictionary;
-import java.util.HashMap;
-import java.util.LinkedHashMap;
-import java.util.List;
import java.util.Map;
-import opennlp.tools.namefind.NameFinderME;
-import opennlp.tools.namefind.TokenNameFinderModel;
-import opennlp.tools.sentdetect.SentenceDetectorME;
-import opennlp.tools.sentdetect.SentenceModel;
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
-import opennlp.tools.util.Span;
-
-import org.apache.clerezza.rdf.core.LiteralFactory;
-import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.UriRef;
-import org.apache.clerezza.rdf.core.impl.TripleImpl;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
import org.apache.felix.scr.annotations.Component;
-import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
-import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
-import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
-import org.osgi.framework.BundleContext;
+import org.osgi.framework.ServiceRegistration;
import org.osgi.service.component.ComponentContext;
/**
@@ -74,305 +39,56 @@ import org.osgi.service.component.Compon
@Service
public class NamedEntityExtractionEnhancementEngine implements EnhancementEngine, ServiceProperties {
+ private EnhancementEngine engineCore;
+
/**
* The default value for the Execution of this Engine. Currently set to
* {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
*/
public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
- protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
-
- @Property
- public static final String MODELS_PATH = "stanbol.opennlp.models.path";
-
- public static final Log log = LogFactory.getLog(NamedEntityExtractionEnhancementEngine.class);
-
- protected SentenceModel sentenceModel;
-
- protected TokenNameFinderModel personNameModel;
-
- protected TokenNameFinderModel locationNameModel;
-
- protected TokenNameFinderModel organizationNameModel;
-
- protected Map<String,Object[]> entityTypes = new HashMap<String,Object[]>();
-
- protected BundleContext bundleContext;
-
- @SuppressWarnings("unchecked")
- protected void activate(ComponentContext ce) throws IOException {
- bundleContext = ce.getBundleContext();
-
- String directoryPath = null;
- if (ce != null) {
- Dictionary<String,String> properties = ce.getProperties();
- directoryPath = properties.get(MODELS_PATH);
- }
- sentenceModel = new SentenceModel(lookupModelStream(directoryPath, "en-sent.bin"));
-
- personNameModel = buildNameModel(directoryPath, "person", OntologicalClasses.DBPEDIA_PERSON);
-
- locationNameModel = buildNameModel(directoryPath, "location", OntologicalClasses.DBPEDIA_PLACE);
-
- organizationNameModel = buildNameModel(directoryPath, "organization",
- OntologicalClasses.DBPEDIA_ORGANISATION);
+ private ServiceRegistration dfpServiceRegistration;
+
+ @Reference
+ private DataFileProvider dataFileProvider;
+
+ protected void activate(ComponentContext ctx) throws IOException {
+ // Need our DataFileProvider before building the models
+ dfpServiceRegistration = ctx.getBundleContext().registerService(
+ DataFileProvider.class.getName(),
+ new ClasspathDataFileProvider(), null);
+
+ engineCore = new EngineCore(dataFileProvider, ctx.getBundleContext().getBundle().getSymbolicName());
}
- // @Deactivate
protected void deactivate(ComponentContext ce) {
- sentenceModel = null;
- personNameModel = null;
- locationNameModel = null;
- organizationNameModel = null;
- }
-
- protected InputStream lookupModelStream(String directoryPath, String modelRelativePath) throws IOException {
-
- ClassLoader loader = this.getClass().getClassLoader();
- if (directoryPath != null && directoryPath.length() > 0) {
- // load custom models from the provided FS directory
- File modelData = new File(new File(directoryPath), modelRelativePath);
- return new FileInputStream(modelData);
- } else {
- // load default OpenNLP models from classpath (embedded in the defaultdata bundle)
- String resourcePath = "org/apache/stanbol/defaultdata/opennlp/" + modelRelativePath;
- InputStream in = loader.getResourceAsStream(resourcePath);
- if (in == null) {
- throw new IOException("Coult not find resource from the classpath: " + resourcePath);
- }
- return in;
- }
- }
-
- protected TokenNameFinderModel buildNameModel(String directoryPath, String name, UriRef typeUri) throws IOException {
- String modelRelativePath = String.format("en-ner-%s.bin", name);
- TokenNameFinderModel model = new TokenNameFinderModel(lookupModelStream(directoryPath,
- modelRelativePath));
- // register the name finder instances for matching owl class
- entityTypes.put(name, new Object[] {typeUri, model});
- return model;
- }
-
- public void computeEnhancements(ContentItem ci) throws EngineException {
- String text;
- try {
- text = IOUtils.toString(ci.getStream(), "UTF-8");
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
- if (text.trim().length() == 0) {
- // TODO: make the length of the data a field of the ContentItem
- // interface to be able to filter out empty items in the canEnhance
- // method
- log.warn("nothing to extract knowledge from");
- return;
- }
-
- try {
- for (Map.Entry<String,Object[]> type : entityTypes.entrySet()) {
- String typeLabel = type.getKey();
- Object[] typeInfo = type.getValue();
- UriRef typeUri = (UriRef) typeInfo[0];
- TokenNameFinderModel nameFinderModel = (TokenNameFinderModel) typeInfo[1];
- findNamedEntities(ci, text, typeUri, typeLabel, nameFinderModel);
- }
- } catch (Exception e) { // TODO: makes it sense to catch Exception here?
- throw new EngineException(this, ci, e);
- }
- }
-
- protected void findNamedEntities(final ContentItem ci,
- final String text,
- final UriRef typeUri,
- final String typeLabel,
- final TokenNameFinderModel nameFinderModel) {
-
- if (ci == null) {
- throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
- }
- if (text == null) {
- log.warn("NULL was parsed as text for content item " + ci.getId() + "! -> call ignored");
- return;
- }
- LiteralFactory literalFactory = LiteralFactory.getInstance();
- MGraph g = ci.getMetadata();
- Map<String,List<NameOccurrence>> entityNames = extractNameOccurrences(nameFinderModel, text);
-
- Map<String,UriRef> previousAnnotations = new LinkedHashMap<String,UriRef>();
- for (Map.Entry<String,List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
-
- String name = nameInContext.getKey();
- List<NameOccurrence> occurrences = nameInContext.getValue();
-
- UriRef firstOccurrenceAnnotation = null;
-
- for (NameOccurrence occurrence : occurrences) {
- UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
- g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, literalFactory
- .createTypedLiteral(name)));
- g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, literalFactory
- .createTypedLiteral(occurrence.context)));
- g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri));
- g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory
- .createTypedLiteral(occurrence.confidence)));
- if (occurrence.start != null && occurrence.end != null) {
- g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory
- .createTypedLiteral(occurrence.start)));
- g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory
- .createTypedLiteral(occurrence.end)));
- }
-
- // add the subsumption relationship among occurrences of the same
- // name
- if (firstOccurrenceAnnotation == null) {
- // check already extracted annotations to find a first most
- // specific occurrence
- for (Map.Entry<String,UriRef> entry : previousAnnotations.entrySet()) {
- if (entry.getKey().contains(name)) {
- // we have found a most specific previous
- // occurrence, use it as subsumption target
- firstOccurrenceAnnotation = entry.getValue();
- g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
- break;
- }
- }
- if (firstOccurrenceAnnotation == null) {
- // no most specific previous occurrence, I am the first,
- // most specific occurrence to be later used as a target
- firstOccurrenceAnnotation = textAnnotation;
- previousAnnotations.put(name, textAnnotation);
- }
- } else {
- // I am referring to a most specific first occurrence of the
- // same name
- g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
- }
- }
+ if(dfpServiceRegistration != null) {
+ dfpServiceRegistration.unregister();
+ dfpServiceRegistration = null;
}
}
-
- public Collection<String> extractPersonNames(String text) {
- return extractNames(personNameModel, text);
- }
-
- public Collection<String> extractLocationNames(String text) {
- return extractNames(locationNameModel, text);
- }
-
- public Collection<String> extractOrganizationNames(String text) {
- return extractNames(organizationNameModel, text);
- }
-
- public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text) {
- return extractNameOccurrences(personNameModel, text);
- }
-
- public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text) {
- return extractNameOccurrences(locationNameModel, text);
- }
-
- public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text) {
- return extractNameOccurrences(organizationNameModel, text);
+
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
+ (Object) defaultOrder));
}
- protected Collection<String> extractNames(TokenNameFinderModel nameFinderModel, String text) {
- return extractNameOccurrences(nameFinderModel, text).keySet();
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ checkCore();
+ return engineCore.canEnhance(ci);
}
- protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
- String text) {
-
- // version with explicit sentence endings to reflect heading / paragraph
- // structure of an HTML or PDF document converted to text
- String textWithDots = text.replaceAll("\\n\\n", ".\n");
-
- SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel);
-
- Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
-
- NameFinderME finder = new NameFinderME(nameFinderModel);
-
- Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
- Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
- for (int i = 0; i < sentenceSpans.length; i++) {
- String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
-
- // build a context by concatenating three sentences to be used for
- // similarity ranking / disambiguation + contextual snippet in the
- // extraction structure
- List<String> contextElements = new ArrayList<String>();
- if (i > 0) {
- CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
- contextElements.add(previousSentence.toString().trim());
- }
- contextElements.add(sentence.toString().trim());
- if (i + 1 < sentenceSpans.length) {
- CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
- contextElements.add(nextSentence.toString().trim());
- }
- String context = StringUtils.join(contextElements, " ");
-
- // extract the names in the current sentence and
- // keep them store them with the current context
- String[] tokens = tokenizer.tokenize(sentence);
- Span[] nameSpans = finder.find(tokens);
- double[] probs = finder.probs();
- String[] names = Span.spansToStrings(nameSpans, tokens);
- int lastStartPosition = 0;
- for (int j = 0; j < names.length; j++) {
- String name = names[j];
- Double confidence = 1.0;
- for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
- confidence *= probs[k];
- }
- int start = sentence.substring(lastStartPosition).indexOf(name);
- Integer absoluteStart = null;
- Integer absoluteEnd = null;
- if (start != -1) {
- /*
- * NOTE (rw, issue 19, 20100615) Here we need to set the new start position, by adding the
- * current start to the lastStartPosion. we need also to use the lastStartPosition to
- * calculate the start of the element. The old code had not worked if names contains more
- * than a single element!
- */
- lastStartPosition += start;
- absoluteStart = sentenceSpans[i].getStart() + lastStartPosition;
- absoluteEnd = absoluteStart + name.length();
- }
- NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, context,
- confidence);
-
- List<NameOccurrence> occurrences = nameOccurrences.get(name);
- if (occurrences == null) {
- occurrences = new ArrayList<NameOccurrence>();
- }
- occurrences.add(occurrence);
- nameOccurrences.put(name, occurrences);
- }
- }
- finder.clearAdaptiveData();
-
- if (log.isDebugEnabled()) {
- for (List<NameOccurrence> occurrences : nameOccurrences.values()) {
- log.debug("Occurrences found: " + StringUtils.join(occurrences, ", "));
- }
- }
- return nameOccurrences;
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ checkCore();
+ engineCore.computeEnhancements(ci);
}
-
- public int canEnhance(ContentItem ci) {
- // in case text/pain;charSet=UTF8 is parsed
- String mimeType = ci.getMimeType().split(";", 2)[0];
- if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
- return ENHANCE_SYNCHRONOUS;
+
+ private void checkCore() {
+ if(engineCore == null) {
+ throw new IllegalStateException("EngineCore not initialized");
}
- return CANNOT_ENHANCE;
}
-
- @Override
- public Map<String,Object> getServiceProperties() {
- return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
- (Object) defaultOrder));
- }
-
-}
+}
\ No newline at end of file
Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1089451&r1=1089450&r2=1089451&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Wed Apr 6 13:40:05 2011
@@ -16,12 +16,17 @@
*/
package org.apache.stanbol.enhancer.engines.opennlp.impl;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
+
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
-import java.util.Dictionary;
-import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -36,15 +41,10 @@ import org.apache.clerezza.rdf.core.UriR
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
-import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
-
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.*;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
-
public class TestNamedEntityExtractionEnhancementEngine extends Assert {
public static final String SINGLE_SENTENCE = "Dr Patrick Marshall (1869 - November 1950) was a"
@@ -57,18 +57,11 @@ public class TestNamedEntityExtractionEn
+ " without any name.\n"
+ "A new paragraph is being written. This paragraph has two sentences.";
- static NamedEntityExtractionEnhancementEngine nerEngine = new NamedEntityExtractionEnhancementEngine();
+ static EngineCore nerEngine;
@BeforeClass
public static void setUpServices() throws IOException {
- Dictionary<String, Object> properties = new Hashtable<String, Object>();
- MockComponentContext context = new MockComponentContext(properties);
- nerEngine.activate(context);
- }
-
- @AfterClass
- public static void shutdownServices() {
- nerEngine.deactivate(null);
+ nerEngine = new EngineCore(new ClasspathDataFileProvider(), "TEST_BUNDLE_SYMBOLIC_NAME");
}
public static ContentItem wrapAsContentItem(final String id,
@@ -223,5 +216,4 @@ public class TestNamedEntityExtractionEn
assertTrue(!endPosIterator.hasNext());
}
}
-
-}
+}
\ No newline at end of file
Modified: incubator/stanbol/trunk/enhancer/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/StatelessEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/StatelessEngineTest.java?rev=1089451&r1=1089450&r2=1089451&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/StatelessEngineTest.java (original)
+++ incubator/stanbol/trunk/enhancer/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/StatelessEngineTest.java Wed Apr 6 13:40:05 2011
@@ -37,7 +37,7 @@ public class StatelessEngineTest extends
"http://purl.org/dc/terms/creator.*LangIdEnhancementEngine",
"http://purl.org/dc/terms/language.*en",
"http://fise.iks-project.eu/ontology/entity-label.*Paris",
- "http://purl.org/dc/terms/creator.*NamedEntityExtractionEnhancementEngine",
+ "http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
"http://fise.iks-project.eu/ontology/entity-label.*Bob Marley"
)
.generateDocumentation(
Modified: incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml?rev=1089451&r1=1089450&r2=1089451&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml (original)
+++ incubator/stanbol/trunk/enhancer/launchers/full/src/main/bundles/list.xml Wed Apr 6 13:40:05 2011
@@ -195,6 +195,11 @@
<artifactId>org.apache.stanbol.commons.stanboltools.offline</artifactId>
<version>0.9-SNAPSHOT</version>
</bundle>
+ <bundle>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
+ <version>0.9-SNAPSHOT</version>
+ </bundle>
</startLevel>
<!-- Clerezza storage and sparql infrastructure -->
Modified: incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml?rev=1089451&r1=1089450&r2=1089451&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml (original)
+++ incubator/stanbol/trunk/enhancer/launchers/lite/src/main/bundles/list.xml Wed Apr 6 13:40:05 2011
@@ -308,6 +308,11 @@
<artifactId>org.apache.stanbol.enhancer.jersey</artifactId>
<version>0.9-SNAPSHOT</version>
</bundle>
+ <bundle>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
+ <version>0.9-SNAPSHOT</version>
+ </bundle>
</startLevel>
<!-- Stanbol Enhancer Enhancement Engines -->