You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by og...@apache.org on 2011/04/08 11:20:23 UTC
svn commit: r1090165 - in /incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src: main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/

Author: ogrisel
Date: Fri Apr  8 09:20:23 2011
New Revision: 1090165

URL: http://svn.apache.org/viewvc?rev=1090165&view=rev
Log:
better name for the Named Entity engine core

Added:
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
Removed:
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/EngineCore.java
Modified:
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java

Added: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1090165&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Fri Apr  8 09:20:23 2011
@@ -0,0 +1,316 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.opennlp.impl;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.Span;
+
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Core of our EnhancementEngine, separated from the OSGi service to make it easier to test this.
+ */
+public class NEREngineCore implements EnhancementEngine {
+    protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+    private final DataFileProvider dataFileProvider;
+    private final String bundleSymbolicName;
+    protected final SentenceModel sentenceModel;
+    protected final TokenNameFinderModel personNameModel;
+    protected final TokenNameFinderModel locationNameModel;
+    protected final TokenNameFinderModel organizationNameModel;
+    protected Map<String,Object[]> entityTypes = new HashMap<String,Object[]>();
+
+    /** Comments about our models */
+    public static final Map<String, String> DATA_FILE_COMMENTS;
+    static {
+        DATA_FILE_COMMENTS = new HashMap<String, String>();
+        DATA_FILE_COMMENTS.put("Default data files", "provided by the org.apache.stanbol.defaultdata bundle");
+    }
+
+    NEREngineCore(DataFileProvider dfp, String bundleSymbolicName) throws InvalidFormatException, IOException {
+        dataFileProvider = dfp;
+        this.bundleSymbolicName = bundleSymbolicName;
+        sentenceModel = new SentenceModel(lookupModelStream("en-sent.bin"));
+        personNameModel = buildNameModel("person", OntologicalClasses.DBPEDIA_PERSON);
+        locationNameModel = buildNameModel("location", OntologicalClasses.DBPEDIA_PLACE);
+        organizationNameModel = buildNameModel("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+    }
+
+    protected InputStream lookupModelStream(String modelRelativePath) throws IOException {
+        return dataFileProvider.getInputStream(bundleSymbolicName, modelRelativePath, DATA_FILE_COMMENTS);
+    }
+
+    protected TokenNameFinderModel buildNameModel(String name, UriRef typeUri) throws IOException {
+        String modelRelativePath = String.format("en-ner-%s.bin", name);
+        TokenNameFinderModel model = new TokenNameFinderModel(lookupModelStream(modelRelativePath));
+        // register the name finder instances for matching owl class
+        entityTypes.put(name, new Object[] {typeUri, model});
+        return model;
+    }
+
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        String text;
+        try {
+            text = IOUtils.toString(ci.getStream(), "UTF-8");
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
+        }
+        if (text.trim().length() == 0) {
+            // TODO: make the length of the data a field of the ContentItem
+            // interface to be able to filter out empty items in the canEnhance
+            // method
+            log.warn("nothing to extract knowledge from in ContentItem {}", ci);
+            return;
+        }
+        log.debug("computeEnhancements {} text={}", ci.getId(), StringUtils.abbreviate(text, 100));
+
+        try {
+            for (Map.Entry<String,Object[]> type : entityTypes.entrySet()) {
+                String typeLabel = type.getKey();
+                Object[] typeInfo = type.getValue();
+                UriRef typeUri = (UriRef) typeInfo[0];
+                TokenNameFinderModel nameFinderModel = (TokenNameFinderModel) typeInfo[1];
+                findNamedEntities(ci, text, typeUri, typeLabel, nameFinderModel);
+            }
+        } catch (Exception e) {
+            throw new EngineException(this, ci, e);
+        }
+    }
+
+    protected void findNamedEntities(final ContentItem ci,
+                                     final String text,
+                                     final UriRef typeUri,
+                                     final String typeLabel,
+                                     final TokenNameFinderModel nameFinderModel) {
+
+        if (ci == null) {
+            throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
+        }
+        if (text == null) {
+            log.warn("NULL was parsed as text for content item " + ci.getId() + "! -> call ignored");
+            return;
+        }
+        log.debug("findNamedEntities typeUri={}, type={}, text=", 
+                new Object[]{ typeUri, typeLabel, StringUtils.abbreviate(text, 100) });
+        LiteralFactory literalFactory = LiteralFactory.getInstance();
+        MGraph g = ci.getMetadata();
+        Map<String,List<NameOccurrence>> entityNames = extractNameOccurrences(nameFinderModel, text);
+
+        Map<String,UriRef> previousAnnotations = new LinkedHashMap<String,UriRef>();
+        for (Map.Entry<String,List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
+
+            String name = nameInContext.getKey();
+            List<NameOccurrence> occurrences = nameInContext.getValue();
+
+            UriRef firstOccurrenceAnnotation = null;
+
+            for (NameOccurrence occurrence : occurrences) {
+                UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, literalFactory
+                        .createTypedLiteral(name)));
+                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, literalFactory
+                        .createTypedLiteral(occurrence.context)));
+                g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri));
+                g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory
+                        .createTypedLiteral(occurrence.confidence)));
+                if (occurrence.start != null && occurrence.end != null) {
+                    g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory
+                            .createTypedLiteral(occurrence.start)));
+                    g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory
+                            .createTypedLiteral(occurrence.end)));
+                }
+
+                // add the subsumption relationship among occurrences of the same
+                // name
+                if (firstOccurrenceAnnotation == null) {
+                    // check already extracted annotations to find a first most
+                    // specific occurrence
+                    for (Map.Entry<String,UriRef> entry : previousAnnotations.entrySet()) {
+                        if (entry.getKey().contains(name)) {
+                            // we have found a most specific previous
+                            // occurrence, use it as subsumption target
+                            firstOccurrenceAnnotation = entry.getValue();
+                            g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
+                            break;
+                        }
+                    }
+                    if (firstOccurrenceAnnotation == null) {
+                        // no most specific previous occurrence, I am the first,
+                        // most specific occurrence to be later used as a target
+                        firstOccurrenceAnnotation = textAnnotation;
+                        previousAnnotations.put(name, textAnnotation);
+                    }
+                } else {
+                    // I am referring to a most specific first occurrence of the
+                    // same name
+                    g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
+                }
+            }
+        }
+    }
+
+    public Collection<String> extractPersonNames(String text) {
+        return extractNames(personNameModel, text);
+    }
+
+    public Collection<String> extractLocationNames(String text) {
+        return extractNames(locationNameModel, text);
+    }
+
+    public Collection<String> extractOrganizationNames(String text) {
+        return extractNames(organizationNameModel, text);
+    }
+
+    public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text) {
+        return extractNameOccurrences(personNameModel, text);
+    }
+
+    public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text) {
+        return extractNameOccurrences(locationNameModel, text);
+    }
+
+    public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text) {
+        return extractNameOccurrences(organizationNameModel, text);
+    }
+
+    protected Collection<String> extractNames(TokenNameFinderModel nameFinderModel, String text) {
+        return extractNameOccurrences(nameFinderModel, text).keySet();
+    }
+
+    protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
+                                                                      String text) {
+
+        // version with explicit sentence endings to reflect heading / paragraph
+        // structure of an HTML or PDF document converted to text
+        String textWithDots = text.replaceAll("\\n\\n", ".\n");
+
+        SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel);
+
+        Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
+
+        NameFinderME finder = new NameFinderME(nameFinderModel);
+        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
+        Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
+        for (int i = 0; i < sentenceSpans.length; i++) {
+            String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
+
+            // build a context by concatenating three sentences to be used for
+            // similarity ranking / disambiguation + contextual snippet in the
+            // extraction structure
+            List<String> contextElements = new ArrayList<String>();
+            if (i > 0) {
+                CharSequence previousSentence = sentenceSpans[i - 1].getCoveredText(text);
+                contextElements.add(previousSentence.toString().trim());
+            }
+            contextElements.add(sentence.toString().trim());
+            if (i + 1 < sentenceSpans.length) {
+                CharSequence nextSentence = sentenceSpans[i + 1].getCoveredText(text);
+                contextElements.add(nextSentence.toString().trim());
+            }
+            String context = StringUtils.join(contextElements, " ");
+
+            // extract the names in the current sentence and
+            // store them together with the current context
+            String[] tokens = tokenizer.tokenize(sentence);
+            Span[] nameSpans = finder.find(tokens);
+            double[] probs = finder.probs();
+            String[] names = Span.spansToStrings(nameSpans, tokens);
+            int lastStartPosition = 0;
+            for (int j = 0; j < names.length; j++) {
+                String name = names[j];
+                Double confidence = 1.0;
+                for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
+                    confidence *= probs[k];
+                }
+                int start = sentence.substring(lastStartPosition).indexOf(name);
+                Integer absoluteStart = null;
+                Integer absoluteEnd = null;
+                if (start != -1) {
+                    /*
+                     * NOTE (rw, issue 19, 20100615) Here we need to set the new start position by adding the
+                     * current start to lastStartPosition. We also need to use lastStartPosition to
+                     * calculate the start of the element. The old code did not work if names contained more
+                     * than a single element!
+                     */
+                    lastStartPosition += start;
+                    absoluteStart = sentenceSpans[i].getStart() + lastStartPosition;
+                    absoluteEnd = absoluteStart + name.length();
+                }
+                NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd, context,
+                        confidence);
+
+                List<NameOccurrence> occurrences = nameOccurrences.get(name);
+                if (occurrences == null) {
+                    occurrences = new ArrayList<NameOccurrence>();
+                }
+                occurrences.add(occurrence);
+                nameOccurrences.put(name, occurrences);
+            }
+        }
+        finder.clearAdaptiveData();
+        log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
+        return nameOccurrences;
+    }
+
+    public int canEnhance(ContentItem ci) {
+        // drop any parameters, in case e.g. text/plain;charset=UTF-8 is passed
+        String mimeType = ci.getMimeType().split(";", 2)[0];
+        if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
+            return ENHANCE_SYNCHRONOUS;
+        }
+        return CANNOT_ENHANCE;
+    }
+}

Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java?rev=1090165&r1=1090164&r2=1090165&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NamedEntityExtractionEnhancementEngine.java Fri Apr  8 09:20:23 2011
@@ -58,7 +58,7 @@ public class NamedEntityExtractionEnhanc
                 DataFileProvider.class.getName(), 
                 new ClasspathDataFileProvider(ctx.getBundleContext().getBundle().getSymbolicName()), null);
         
-        engineCore = new EngineCore(dataFileProvider, ctx.getBundleContext().getBundle().getSymbolicName());
+        engineCore = new NEREngineCore(dataFileProvider, ctx.getBundleContext().getBundle().getSymbolicName());
     }
 
     protected void deactivate(ComponentContext ce) {

Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1090165&r1=1090164&r2=1090165&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Fri Apr  8 09:20:23 2011
@@ -57,13 +57,13 @@ public class TestNamedEntityExtractionEn
             + " without any name.\n"
             + "A new paragraph is being written. This paragraph has two sentences.";
 
-    static EngineCore nerEngine;
+    static NEREngineCore nerEngine;
     
     public static final String FAKE_BUNDLE_SYMBOLIC_NAME = "FAKE_BUNDLE_SYMBOLIC_NAME";
 
     @BeforeClass
     public static void setUpServices() throws IOException {
-        nerEngine = new EngineCore(new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME), FAKE_BUNDLE_SYMBOLIC_NAME);
+        nerEngine = new NEREngineCore(new ClasspathDataFileProvider(FAKE_BUNDLE_SYMBOLIC_NAME), FAKE_BUNDLE_SYMBOLIC_NAME);
     }
 
     public static ContentItem wrapAsContentItem(final String id,