You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2013/12/04 13:12:43 UTC

svn commit: r1547783 - /opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/

Author: markg
Date: Wed Dec  4 12:12:43 2013
New Revision: 1547783

URL: http://svn.apache.org/r1547783
Log:
OPENNLP-614
Fixed a bug in the GeoEntityLinker: no gazetteer lookup was performed when no country context was found.

Modified:
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java Wed Dec  4 12:12:43 2013
@@ -18,11 +18,6 @@ package org.apache.opennlp.addons.tools.
 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;
-import java.sql.CallableStatement;
-import java.sql.Connection;
-import java.sql.DriverManager;
-import java.sql.ResultSet;
-import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -42,12 +37,23 @@ import opennlp.tools.entitylinker.Entity
  */
 public class CountryContext {
 
-  private Connection con;
+ 
   private List<CountryContextEntry> countrydata;
   private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
   private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
   private Set<CountryContextEntry> countryHits = new HashSet<>();
 
+  public CountryContext() {
+  }
+
+  public Map<String, Set<Integer>> getCountryMentions() {
+    return countryMentions;
+  }
+
+  public Set<CountryContextEntry> getCountryHits() {
+    return countryHits;
+  }
+
   public Map<String, Set<String>> getNameCodesMap() {
     return nameCodesMap;
   }
@@ -56,10 +62,6 @@ public class CountryContext {
     this.nameCodesMap = nameCodesMap;
   }
 
-  public CountryContext() {
-  }
-
-
   /**
    * Finds mentions of countries based on a list from MySQL stored procedure
    * called getCountryList. This method finds country mentions in documents,
@@ -71,15 +73,13 @@ public class CountryContext {
    * @return
    */
   public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
-    countryMentions = new HashMap<String, Set<Integer>>();
+    countryMentions = new HashMap<>();
     nameCodesMap.clear();
     try {
-//      if (con == null) {
-//        con = getMySqlConnection(properties);
-//      }
+
       if (countrydata == null) {
-         countrydata = getCountryContextFromFile(properties);
-     //   countrydata = getCountryData(properties);
+        countrydata = getCountryContextFromFile(properties);
+        //   countrydata = getCountryData(properties);
       }
       for (CountryContextEntry entry : countrydata) {
         Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
@@ -122,95 +122,6 @@ public class CountryContext {
     return countryMentions;
   }
 
-  /**
-   * returns a unique list of country codes
-   *
-   * @param countryMentions the countryMentions discovered
-   * @return
-   */
-  public static Set<String> getCountryCodes(List<CountryContextHit> hits) {
-    Set<String> ccs = new HashSet<String>();
-    for (CountryContextHit hit : hits) {
-      ccs.add(hit.getCountryCode().toLowerCase());
-    }
-    return ccs;
-  }
-
-  public static String getCountryCodeCSV(Set<String> hits) {
-    String csv = "";
-    if (hits.isEmpty()) {
-      return csv;
-    }
-
-    for (String code : hits) {
-      csv += "," + code;
-    }
-    return csv.substring(1);
-  }
-
-  private Connection getMySqlConnection(EntityLinkerProperties properties) throws Exception {
-
-    String driver = properties.getProperty("db.driver", "org.gjt.mm.mysql.Driver");
-    String url = properties.getProperty("db.url", "jdbc:mysql://localhost:3306/world");
-    String username = properties.getProperty("db.username", "root");
-    String password = properties.getProperty("db.password", "?");
-
-    Class.forName(driver);
-    Connection conn = DriverManager.getConnection(url, username, password);
-    return conn;
-  }
-
-  /**
-   * reads the list from the database by calling a stored procedure
-   * getCountryList
-   *
-   * @param properties
-   * @return
-   * @throws SQLException
-   */
-  private List<CountryContextEntry> getCountryData(EntityLinkerProperties properties) throws SQLException {
-    List<CountryContextEntry> entries = new ArrayList<CountryContextEntry>();
-    try {
-      if (con == null) {
-        con = getMySqlConnection(properties);
-      }
-      CallableStatement cs;
-      cs = con.prepareCall("CALL `getCountryList`()");
-      ResultSet rs;
-      rs = cs.executeQuery();
-      if (rs == null) {
-        return entries;
-      }
-      while (rs.next()) {
-        CountryContextEntry s = new CountryContextEntry();
-        //rc,cc1, full_name_nd_ro,dsg
-        s.setRc(rs.getString(1));
-        s.setCc1(rs.getString(2));
-//a.district, 
-        s.setFull_name_nd_ro(rs.getString(3));
-//b.name as countryname, 
-        s.setDsg(rs.getString(4));
-        entries.add(s);
-      }
-
-    } catch (SQLException ex) {
-      System.err.println(ex);
-    } catch (Exception e) {
-      System.err.println(e);
-    } finally {
-      con.close();
-    }
-    return entries;
-  }
-
-  public Map<String, Set<Integer>> getCountryMentions() {
-    return countryMentions;
-  }
-
-  public Set<CountryContextEntry> getCountryHits() {
-    return countryHits;
-  }
-
   private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
     List<CountryContextEntry> entries = new ArrayList<>();
     String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java Wed Dec  4 12:12:43 2013
@@ -30,7 +30,7 @@ public class CountryContextEntry {
   private String cc1;
   private String full_name_nd_ro;
   private String dsg;
-
+  private String provCode;
   public CountryContextEntry() {
   }
 
@@ -41,6 +41,14 @@ public class CountryContextEntry {
     this.dsg = dsg;
   }
 
+  public String getProvCode() {
+    return provCode;
+  }
+
+  public void setProvCode(String provCode) {
+    this.provCode = provCode;
+  }
+
   public String getRc() {
     return rc;
   }

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java Wed Dec  4 12:12:43 2013
@@ -36,7 +36,7 @@ public class CountryProximityScorer impl
   String dominantCode = "";
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
 
     score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
 
@@ -134,10 +134,10 @@ public class CountryProximityScorer impl
 
     /**
      * the gaz matches that have a country code that have mentions in the doc
-     * that are closest to the Named Entity should return the best score Analyze
-     * map generates a likelihood score that the toponym from the gaz is
-     * referring to one of the countries Map<countrycode, prob that this span is
-     * referring to the toponym form this code key>
+     * that are closest to the Named Entity should return the best score.
+     * analyzeMap generates a likelihood score that the toponym from the gaz is
+     * referring to one of the countries, i.e., Map<countrycode, prob that this
+     * span is referring to the toponym from this code key>
      */
     Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
     for (BaseLink link : span.getLinkedEntries()) {
@@ -148,21 +148,16 @@ public class CountryProximityScorer impl
         score = scoreMap.get(spanCountryCode);
         ///does the name extracted match a country name?
         if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
-          //if so, is it the correct country code for that name
+          //if so, is it the correct country code for that name?
           if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
             //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
             //TODO: make this multiplier configurable
-            //TODO: improve this with a geographic/geometry based clustering (linear binning to be more precise) of points returned from the gaz
             score = (score + .75) > 1.0 ? 1d : (score + .75);
-            //boost the score if the hit is from the dominant country context
 
             if (link.getItemParentID().equals(dominantCode)) {
               score = (score + .25) > 1.0 ? 1d : (score + .25);
             }
-
-
           }
-
         }
       }
       link.getScoreMap().put("countrycontext", score);
@@ -184,7 +179,7 @@ public class CountryProximityScorer impl
   private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
 
     Map<String, Double> scoreMap = new HashMap<String, Double>();
-    if(distanceMap.isEmpty()){
+    if (distanceMap.isEmpty()) {
       return scoreMap;
     }
     TreeSet<Integer> all = new TreeSet<Integer>();
@@ -195,8 +190,8 @@ public class CountryProximityScorer impl
 
     Integer min = all.first();
     Integer max = all.last();
-    if(min==max){
-      min=0;
+    if (min == max) {
+      min = 0;
     }
     for (String key : distanceMap.keySet()) {
 

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java Wed Dec  4 12:12:43 2013
@@ -72,7 +72,9 @@ public class GazateerSearcher {
       /**
        * build the search string
        */
-      String luceneQueryString = "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^10000";
+      String luceneQueryString = !code.equals("")
+              ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
+              : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
       /**
        * check the cache and go no further if the records already exist
        */
@@ -82,7 +84,7 @@ public class GazateerSearcher {
       }
       if (geonamesIndex == null) {
         String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
-        String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+        String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".60");
         scoreCutoff = Double.valueOf(cutoff);
         geonamesIndex = new MMapDirectory(new File(indexloc));
         geonamesReader = DirectoryReader.open(geonamesIndex);

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java Wed Dec  4 12:12:43 2013
@@ -68,10 +68,15 @@ public class GeoEntityLinker implements 
         ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
         if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
           // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
-          for (String code : countryMentions.keySet()) {
-            if (!code.equals("us")) {
-              geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
+          if (!countryMentions.keySet().isEmpty()) {
+            for (String code : countryMentions.keySet()) {
+              if (!code.equals("us")) {
+                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
+              }
             }
+          } else {
+            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, "", linkerProperties));
+
           }
 
         }