You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by wk...@apache.org on 2011/06/09 12:54:36 UTC

svn commit: r1133787 - in /incubator/stanbol/trunk/enhancer/engines/opencalais: ./ src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/ src/main/resources/ src/test/java/org/apache/stanbol/enhancer/engines/opencalais/impl/

Author: wkasper
Date: Thu Jun  9 10:54:35 2011
New Revision: 1133787

URL: http://svn.apache.org/viewvc?rev=1133787&view=rev
Log:
Stanbol-215:
added configurable type mapping for OpenCalais types
provided a default mapping to dbpedia types, presupposed by autotagging engines
added configurable option whether OpenCalais LinkedData references should be added as entity annotations (by default no)

Added:
    incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/resources/   (with props)
    incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/resources/calaisTypeMap.txt
Modified:
    incubator/stanbol/trunk/enhancer/engines/opencalais/README.txt
    incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/CalaisEntityOccurrence.java
    incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
    incubator/stanbol/trunk/enhancer/engines/opencalais/src/test/java/org/apache/stanbol/enhancer/engines/opencalais/impl/TestOpenCalaisEngine.java

Modified: incubator/stanbol/trunk/enhancer/engines/opencalais/README.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opencalais/README.txt?rev=1133787&r1=1133786&r2=1133787&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opencalais/README.txt (original)
+++ incubator/stanbol/trunk/enhancer/engines/opencalais/README.txt Thu Jun  9 10:54:35 2011
@@ -12,4 +12,15 @@ In the OSGi configuration console the ke
 
 Also, the tests require the API key. Without the key some tests will be skipped. For Maven the key can be set as a system property on the command line:
 
-	mvn -Deu.iksproject.fise.engines.opencalais.license=YOUR_API_KEY [install|test]
+	mvn -Dorg.apache.stanbol.enhancer.engines.opencalais.license=YOUR_API_KEY [install|test]
+
+
+
+Configuration properties that influence the enhancements delivered from the engine at runtime are:
+
+- org.apache.stanbol.enhancer.engines.opencalais.typeMap
+	The value is the name of a file that maps the NER types from OpenCalais to other types. Each line holds one entry of the form "calaisType=targetType"; lines starting with "#" are ignored. By default, a mapping to the DBpedia types is provided. If no mapping is desired, pass a mapping file without entries.
+	
+- org.apache.stanbol.enhancer.engines.opencalais.NERonly
+	A boolean property that specifies whether only the NER enhancements are delivered. If true (the default), the OpenCalais Linked Data references are omitted; if false, they are included as entity references in addition to the NER enhancements.
+

Modified: incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/CalaisEntityOccurrence.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/CalaisEntityOccurrence.java?rev=1133787&r1=1133786&r2=1133787&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/CalaisEntityOccurrence.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/CalaisEntityOccurrence.java Thu Jun  9 10:54:35 2011
@@ -27,10 +27,10 @@ public class CalaisEntityOccurrence {
 
     public Resource id;
     public Resource type;
-    public Resource name;
+    public String name;
     public Integer offset;
     public Integer length;
-    public Resource exact;
+    public String exact;
     public String context;
     public Double relevance = -1.0;
 

Modified: incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java?rev=1133787&r1=1133786&r2=1133787&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java Thu Jun  9 10:54:35 2011
@@ -16,9 +16,22 @@
  */
 package org.apache.stanbol.enhancer.engines.opencalais.impl;
 
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
+
+import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
 import java.io.UnsupportedEncodingException;
 import java.net.HttpURLConnection;
@@ -47,6 +60,7 @@ import org.apache.clerezza.rdf.core.acce
 import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.clerezza.rdf.core.serializedform.Parser;
+import org.apache.clerezza.rdf.core.serializedform.Serializer;
 import org.apache.clerezza.rdf.core.sparql.ParseException;
 import org.apache.clerezza.rdf.core.sparql.QueryParser;
 import org.apache.clerezza.rdf.core.sparql.ResultSet;
@@ -63,15 +77,11 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
-import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.osgi.framework.BundleContext;
 import org.osgi.service.component.ComponentContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import static org.apache.stanbol.enhancer.servicesapi.ServiceProperties.ENHANCEMENT_ENGINE_ORDERING;
-import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.*;
-
 
 /**
  * This class provides an interface to the OpenCalais service for Named Entity Recognition.
@@ -112,6 +122,12 @@ public class OpenCalaisEngine implements
 
     @Property(value = "http://api.opencalais.com/enlighten/rest/")
     public static final String CALAIS_URL_KEY = "org.apache.stanbol.enhancer.engines.opencalais.url";
+    
+    @Property
+    public static final String CALAIS_TYPE_MAP_KEY = "org.apache.stanbol.enhancer.engines.opencalais.typeMap";
+    
+    @Property(value="true")
+    public static final String CALAIS_NER_ONLY_MODE_KEY = "org.apache.stanbol.enhancer.engines.opencalais.NERonly";
 
     /**
      * the URL for the Calais REST Service
@@ -122,12 +138,29 @@ public class OpenCalaisEngine implements
      * the license key from OpenCalais for using the service
      */
     private String licenseKey = null;
+    
+    private String calaisTypeMapFile = null;
+
+    /**
+     * specify whether only the NER results from OpenCalais should be used. Entity references from OpenCalais will be omitted. This mode is intended to be used with another entity tagging engine.
+     */
+    private boolean onlyNERMode;
 
     @Reference
     TcManager tcManager;
 
     BundleContext bundleContext;
 
+    /**
+     * a map for mapping Calais classes to other classes (e.g. from dbpedia)
+     */
+    private Map<UriRef,UriRef> calaisTypeMap;
+    
+    /**
+     * the default file containing type mappings. Key and value are separated by the regular expression '\s*=\s*'.
+     */
+    private static final String CALAIS_TYPE_MAP_DEFAULT ="calaisTypeMap.txt";
+
     public String getLicenseKey() {
         return licenseKey;
     }
@@ -144,12 +177,48 @@ public class OpenCalaisEngine implements
         this.calaisUrl = calaisUrl;
     }
 
+    public Map<UriRef,UriRef> getCalaisTypeMap() {
+      return calaisTypeMap;
+    }
+
+    public void setCalaisTypeMap(Map<UriRef,UriRef> calaisTypeMap) {
+      this.calaisTypeMap = calaisTypeMap;
+    }
+
     public Map<String, Object> getServiceProperties() {
         // TODO Auto-generated method stub
         return Collections.unmodifiableMap(Collections.singletonMap(
                 ENHANCEMENT_ENGINE_ORDERING,
                 (Object) defaultOrder));
     }
+    
+    /**
+     * Loads the mapping from OpenCalais types to other types into the
+     * <code>calaisTypeMap</code> field.
+     * <p>
+     * Every line of the form <code>key=value</code> (optional whitespace around
+     * the <code>=</code>) adds one entry. Lines starting with <code>#</code> and
+     * lines that do not split into exactly two parts are skipped. I/O errors are
+     * logged and not propagated; entries read before the error are kept.
+     *
+     * @param resource the name of the mapping file; if <code>null</code> or
+     *        blank, the default mapping <code>CALAIS_TYPE_MAP_DEFAULT</code> is
+     *        read via the class loader instead
+     */
+    protected void loadTypeMap(String resource) {
+      InputStream in = null;
+      try {
+        // Only a non-blank name denotes a user supplied file. The previous test
+        // (resource == null || ...) handed a null name to FileInputStream.
+        if (resource != null && resource.trim().length() > 0) {
+          in = new FileInputStream(resource);
+        }
+        else {
+          in = getClass().getClassLoader().getResourceAsStream(CALAIS_TYPE_MAP_DEFAULT);
+        }
+        // getResourceAsStream signals a missing resource with null, not an exception
+        if (in == null) {
+          log.error("Error in reading type map file: {} not found", CALAIS_TYPE_MAP_DEFAULT);
+          return;
+        }
+        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
+        String line;
+        while ((line = reader.readLine()) != null) {
+          if (line.startsWith("#")) {
+            continue;
+          }
+          String[] entry = line.split("\\s*=\\s*");
+          if (entry.length == 2) {
+            calaisTypeMap.put(new UriRef(entry[0]), new UriRef(entry[1]));
+          }
+        }
+      }
+      catch (IOException e) {
+        log.error("Error in reading type map file: " + e.getMessage(), e);
+      }
+      finally {
+        // release the stream even when reading failed half way
+        if (in != null) {
+          try {
+            in.close();
+          }
+          catch (IOException e) {
+            log.warn("Could not close type map stream: " + e.getMessage(), e);
+          }
+        }
+      }
+    }
 
     public int canEnhance(ContentItem ci) throws EngineException {
         if (getLicenseKey() == null || getLicenseKey().trim().length() == 0) {
@@ -178,14 +247,16 @@ public class OpenCalaisEngine implements
     }
 
     public void computeEnhancements(ContentItem ci) throws EngineException {
+        String mimeType = ci.getMimeType().split(";", 2)[0].toLowerCase();
         String text = "";
-        if (SUPPORTED_MIMETYPES.contains(ci.getMimeType().split(";", 2)[0].toLowerCase())) {
+        if (SUPPORTED_MIMETYPES.contains(mimeType)) {
             try {
                 text = IOUtils.toString(ci.getStream(),"UTF-8");
             } catch (IOException e) {
                 throw new InvalidContentException(this, ci, e);
             }
         } else {
+            mimeType = "text/plain";
             text = getMetadataText(ci.getMetadata(), new UriRef(ci.getId()));
         }
         if (text == null) {
@@ -193,9 +264,19 @@ public class OpenCalaisEngine implements
             return;
         }
 
-        MGraph calaisModel = getCalaisAnalysis(text, ci);
+        MGraph calaisModel = getCalaisAnalysis(text, mimeType);
         if (calaisModel != null) {
             createEnhancements(queryModel(calaisModel), ci);
+            if (log.isDebugEnabled()) {
+              Serializer serializer = Serializer.getInstance();
+              ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
+              serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
+              try {
+                log.debug("Calais Enhancements:\n{}",debugStream.toString("UTF-8"));
+              } catch (UnsupportedEncodingException e) {
+                e.printStackTrace();
+              }
+            }
         }
 
     }
@@ -218,21 +299,33 @@ public class OpenCalaisEngine implements
                     ci, this);
             MGraph model = ci.getMetadata();
             model.add(new TripleImpl(textAnnotation, DC_TYPE, occ.type));
-            model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, occ.exact));
+            // for autotagger use the name instead of the matched term (that might be a pronoun!)
+            if (onlyNERMode) {
+                model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,literalFactory.createTypedLiteral(occ.name)));
+            }
+            else {
+                model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, literalFactory.createTypedLiteral(occ.exact)));
+            }
             model.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(occ.offset)));
             model.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(occ.offset + occ.length)));
-            model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, occ.exact));
             model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, literalFactory.createTypedLiteral(occ.context)));
             //create EntityAnnotation only once but add a reference to the textAnnotation
             if (entityAnnotationMap.containsKey(occ.id)) {
                 model.add(new TripleImpl(entityAnnotationMap.get(occ.id), DC_RELATION, textAnnotation));
             } else {
-                UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
-                entityAnnotationMap.put(occ.id, entityAnnotation);
-                model.add(new TripleImpl(entityAnnotation, DC_RELATION, textAnnotation));
-                model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, occ.name));
-                model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, occ.type));
-                model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, occ.id));
+                if (onlyNERMode) {
+              // don't create Calais specific entity annotations; let the autotagger create its own
+              // but add a pointer to the first text annotation with that name
+                entityAnnotationMap.put(occ.id,textAnnotation);
+                }
+                else {
+//                UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
+//                entityAnnotationMap.put(occ.id, entityAnnotation);
+//                model.add(new TripleImpl(entityAnnotation, DC_RELATION, textAnnotation));
+//                model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, occ.name));
+//                model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, occ.type));
+//                model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, occ.id));
+                }
             }
         }
     }
@@ -246,8 +339,7 @@ public class OpenCalaisEngine implements
      *
      * @throws EngineException
      */
-    public MGraph getCalaisAnalysis(String text, ContentItem ci) throws EngineException {
-        String mimeType = ci.getMimeType().split(";", 2)[0].toLowerCase();
+    public MGraph getCalaisAnalysis(String text, String mimeType) throws EngineException {
         if (mimeType.equals("text/plain")) {
             mimeType = "text/raw";
         }
@@ -272,10 +364,13 @@ public class OpenCalaisEngine implements
                     .append("&paramsXML=")
                     .append(URLEncoder.encode(calaisParams, "UTF-8"));
             // get annotations from Calais
+            log.info("Calais request sent");
             String calaisResult =
                     doPostRequest(
                             this.getCalaisUrl(), null, postParams.toString(),
                             "application/x-www-form-urlencoded", "UTF-8");
+            log.info("Calais response received: {}",calaisResult.length());
+            log.info("Calais response:\n {}",calaisResult);
             log.debug("Calais data:\n{}", calaisResult);
             // build model from Calais result
             InputStream in = new ByteArrayInputStream(calaisResult.getBytes("utf-8"));
@@ -356,9 +451,20 @@ public class OpenCalaisEngine implements
                 CalaisEntityOccurrence occ = new CalaisEntityOccurrence();
                 Resource disambiguated = row.get("did");
                 occ.id = (disambiguated == null ? row.get("id") : disambiguated);
-                occ.type = (disambiguated == null ? row.get("type") : row.get("dtype"));
-                occ.name = row.get("name");
-                occ.exact = row.get("exact");
+                if (onlyNERMode) {
+                    occ.type = row.get("type");
+                }
+                else {
+                    occ.type = (disambiguated == null ? row.get("type") : row.get("dtype"));
+                }
+                if (calaisTypeMap != null) {
+                    UriRef mappedType = calaisTypeMap.get(occ.type);
+                    if (mappedType != null) {
+                        occ.type = mappedType;
+                    }
+                }
+                occ.name = ((Literal)row.get("name")).getLexicalForm();
+                occ.exact = ((Literal)row.get("exact")).getLexicalForm();
                 //TODO for html the offsets might not be those of the original document but refer to a cleaned up version?
                 occ.offset = Integer.valueOf(((Literal) row.get("offset")).getLexicalForm());
                 // remove brackets
@@ -373,6 +479,7 @@ public class OpenCalaisEngine implements
             // TODO Auto-generated catch block
             e.printStackTrace();
         }
+        log.info("Found {} occurences", result.size());
         return result;
     }
 
@@ -494,8 +601,13 @@ public class OpenCalaisEngine implements
             Dictionary<String, String> properties = ce.getProperties();
             String license = properties.get(LICENSE_KEY);
             String url = properties.get(CALAIS_URL_KEY);
+            calaisTypeMapFile = properties.get(CALAIS_TYPE_MAP_KEY);
+            String standAlone = properties.get(CALAIS_NER_ONLY_MODE_KEY);
             setLicenseKey(license);
             setCalaisUrl(url);
+            calaisTypeMap = new HashMap<UriRef,UriRef>();
+            loadTypeMap(calaisTypeMapFile);
+            onlyNERMode = Boolean.parseBoolean(standAlone);
             //      this.tcManager = TcManager.getInstance();
         }
     }

Propchange: incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/resources/
------------------------------------------------------------------------------
    bugtraq:number = true

Added: incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/resources/calaisTypeMap.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/resources/calaisTypeMap.txt?rev=1133787&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/resources/calaisTypeMap.txt (added)
+++ incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/resources/calaisTypeMap.txt Thu Jun  9 10:54:35 2011
@@ -0,0 +1,10 @@
+http://s.opencalais.com/1/type/em/e/Person=http://dbpedia.org/ontology/Person
+http://s.opencalais.com/1/type/em/e/City=http://dbpedia.org/ontology/Place
+http://s.opencalais.com/1/type/em/e/Continent=http://dbpedia.org/ontology/Place
+http://s.opencalais.com/1/type/em/e/Country=http://dbpedia.org/ontology/Place
+http://s.opencalais.com/1/type/em/e/ProvinceOrState=http://dbpedia.org/ontology/Place
+http://s.opencalais.com/1/type/em/e/Region=http://dbpedia.org/ontology/Place
+http://s.opencalais.com/1/type/em/e/Company=http://dbpedia.org/ontology/Organisation
+http://s.opencalais.com/1/type/em/e/Facility=http://dbpedia.org/ontology/Organisation
+http://s.opencalais.com/1/type/em/e/Organization=http://dbpedia.org/ontology/Organisation
+

Modified: incubator/stanbol/trunk/enhancer/engines/opencalais/src/test/java/org/apache/stanbol/enhancer/engines/opencalais/impl/TestOpenCalaisEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opencalais/src/test/java/org/apache/stanbol/enhancer/engines/opencalais/impl/TestOpenCalaisEngine.java?rev=1133787&r1=1133786&r2=1133787&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opencalais/src/test/java/org/apache/stanbol/enhancer/engines/opencalais/impl/TestOpenCalaisEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opencalais/src/test/java/org/apache/stanbol/enhancer/engines/opencalais/impl/TestOpenCalaisEngine.java Thu Jun  9 10:54:35 2011
@@ -19,6 +19,7 @@ package org.apache.stanbol.enhancer.engi
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.util.Collection;
+import java.util.HashMap;
 
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
@@ -58,6 +59,7 @@ public class TestOpenCalaisEngine {
   @BeforeClass
   public static void oneTimeSetup() {
     calaisExtractor = new OpenCalaisEngine();
+    calaisExtractor.setCalaisTypeMap(new HashMap<UriRef,UriRef>());
     calaisExtractor.tcManager = TcManager.getInstance();
     if (TEST_LICENSE_KEY != null && TEST_LICENSE_KEY.matches("\\w+")) {
       calaisExtractor.setLicenseKey(TEST_LICENSE_KEY);