You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/06/27 00:14:05 UTC

svn commit: r1139954 - /incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java

Author: rwesten
Date: Sun Jun 26 22:14:05 2011
New Revision: 1139954

URL: http://svn.apache.org/viewvc?rev=1139954&view=rev
Log:
The NEREngineCore no longer caches models itself; it now uses the model cache of the OpenNLP component.
This solves issues in cases where the model data is not yet available at the time this engine is created.
This might be the case if the bundle with this engine is started before the bundle providing the model data (or a user adds the missing data to the datafiles directory).

Modified:
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java

Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1139954&r1=1139953&r2=1139954&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Sun Jun 26 22:14:05 2011
@@ -26,7 +26,6 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -52,7 +51,6 @@ import org.apache.clerezza.rdf.core.UriR
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringUtils;
-import org.apache.felix.scr.annotations.Reference;
 import org.apache.stanbol.commons.opennlp.OpenNLP;
 import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
@@ -72,11 +70,17 @@ public class NEREngineCore implements En
 
     private final Logger log = LoggerFactory.getLogger(getClass());
 //    private final String bundleSymbolicName;
-    protected final SentenceModel sentenceModel;
-    protected final TokenNameFinderModel personNameModel;
-    protected final TokenNameFinderModel locationNameModel;
-    protected final TokenNameFinderModel organizationNameModel;
-    protected Map<String,Object[]> entityTypes = new HashMap<String,Object[]>();
+//    protected final SentenceModel sentenceModel;
+//    protected final TokenNameFinderModel personNameModel;
+//    protected final TokenNameFinderModel locationNameModel;
+//    protected final TokenNameFinderModel organizationNameModel;
+//    protected Map<String,Object[]> entityTypes = new HashMap<String,Object[]>();
+    private static Map<String,UriRef> entityTypes = new HashMap<String,UriRef>();
+    static {
+        entityTypes.put("person", OntologicalClasses.DBPEDIA_PERSON);
+        entityTypes.put("location", OntologicalClasses.DBPEDIA_PLACE);
+        entityTypes.put("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+    }
     
     private OpenNLP openNLP;
     
@@ -89,10 +93,10 @@ public class NEREngineCore implements En
 
     public NEREngineCore(OpenNLP openNLP) throws InvalidFormatException, IOException{
         this.openNLP = openNLP;
-        sentenceModel = openNLP.buildSentenceModel("en");
-        personNameModel = buildNameModel("person", OntologicalClasses.DBPEDIA_PERSON);
-        locationNameModel = buildNameModel("location", OntologicalClasses.DBPEDIA_PLACE);
-        organizationNameModel = buildNameModel("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
+//        sentenceModel = openNLP.buildSentenceModel("en");
+//        personNameModel = buildNameModel("person", OntologicalClasses.DBPEDIA_PERSON);
+//        locationNameModel = buildNameModel("location", OntologicalClasses.DBPEDIA_PLACE);
+//        organizationNameModel = buildNameModel("organization", OntologicalClasses.DBPEDIA_ORGANISATION);
     }
     
     NEREngineCore(DataFileProvider dfp) throws InvalidFormatException, IOException {
@@ -103,7 +107,7 @@ public class NEREngineCore implements En
         //String modelRelativePath = String.format("en-ner-%s.bin", name);
         TokenNameFinderModel model = openNLP.buildNameModel(name, "en");
         // register the name finder instances for matching owl class
-        entityTypes.put(name, new Object[] {typeUri, model});
+//        entityTypes.put(name, new Object[] {typeUri, model});
         return model;
     }
 
@@ -132,11 +136,10 @@ public class NEREngineCore implements En
         log.debug("computeEnhancements {} text={}", ci.getId(), StringUtils.abbreviate(text, 100));
 
         try {
-            for (Map.Entry<String,Object[]> type : entityTypes.entrySet()) {
+            for (Map.Entry<String,UriRef> type : entityTypes.entrySet()) {
                 String typeLabel = type.getKey();
-                Object[] typeInfo = type.getValue();
-                UriRef typeUri = (UriRef) typeInfo[0];
-                TokenNameFinderModel nameFinderModel = (TokenNameFinderModel) typeInfo[1];
+                UriRef typeUri = type.getValue();
+                TokenNameFinderModel nameFinderModel = openNLP.buildNameModel(typeLabel, "en");
                 findNamedEntities(ci, text, typeUri, typeLabel, nameFinderModel);
             }
         } catch (Exception e) {
@@ -217,33 +220,84 @@ public class NEREngineCore implements En
     }
 
     public Collection<String> extractPersonNames(String text) {
-        return extractNames(personNameModel, text);
+        return extractNames(getNameModel("person","en"),text);
     }
 
     public Collection<String> extractLocationNames(String text) {
-        return extractNames(locationNameModel, text);
+        return extractNames(getNameModel("location","en"), text);
     }
 
     public Collection<String> extractOrganizationNames(String text) {
-        return extractNames(organizationNameModel, text);
+        return extractNames(getNameModel("organization","en"), text);
     }
 
     public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text) {
-        return extractNameOccurrences(personNameModel, text);
+        return extractNameOccurrences(getNameModel("person","en"), text);
     }
 
     public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text) {
-        return extractNameOccurrences(locationNameModel, text);
+        return extractNameOccurrences(getNameModel("location","en"), text);
     }
 
     public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text) {
-        return extractNameOccurrences(organizationNameModel, text);
+        return extractNameOccurrences(getNameModel("organization","en"), text);
     }
 
     protected Collection<String> extractNames(TokenNameFinderModel nameFinderModel, String text) {
         return extractNameOccurrences(nameFinderModel, text).keySet();
     }
 
+    /**
+     * Gets/builds a TokenNameFinderModel by using {@link #openNLP} and throws
+     * {@link IllegalStateException}s in case the model could not be built or
+     * the data for the model were not found.
+     * @param type the type of the name finder model
+     * @param language the language for the model
+     * @return the model; an {@link IllegalStateException} is thrown if it is not available
+     */
+    private TokenNameFinderModel getNameModel(String type,String language) {
+        try {
+            TokenNameFinderModel model = openNLP.buildNameModel(type, language);
+            if(model != null){
+                return model;
+            } else {
+                throw new IllegalStateException(String.format(
+                    "Unable to built Model for extracting %s from '%s' language " +
+                    "texts because the model data could not be loaded.",
+                    type,language));
+            }
+        } catch (InvalidFormatException e) {
+            throw new IllegalStateException(String.format(
+                "Unable to built Model for extracting %s from '%s' language texts.",
+                type,language),e);
+        } catch (IOException e) {
+            throw new IllegalStateException(String.format(
+                "Unable to built Model for extracting %s from '%s' language texts.",
+                type,language),e);
+        }
+    }
+    private SentenceModel getSentenceModel(String language) {
+        try {
+            SentenceModel model = openNLP.buildSentenceModel(language);
+            if(model != null){
+                return model;
+            } else {
+                throw new IllegalStateException(String.format(
+                    "Unable to built Model for extracting sentences from '%s' " +
+                    "language texts because the model data could not be loaded.",
+                    language));
+            }
+        } catch (InvalidFormatException e) {
+            throw new IllegalStateException(String.format(
+                "Unable to built Model for extracting sentences from '%s' language texts.",
+                language),e);
+        } catch (IOException e) {
+            throw new IllegalStateException(String.format(
+                "Unable to built Model for extracting sentences from '%s' language texts.",
+                language),e);
+        }
+    }
+    
     protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
                                                                       String text) {
 
@@ -252,7 +306,7 @@ public class NEREngineCore implements En
         String textWithDots = text.replaceAll("\\n\\n", ".\n");
         text = removeNonUtf8CompliantCharacters(text);
 
-        SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel);
+        SentenceDetectorME sentenceDetector = new SentenceDetectorME(getSentenceModel("en"));
 
         Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);