You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2015/02/02 23:07:45 UTC

svn commit: r1656591 - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: ./ indexing/ scoring/

Author: markg
Date: Mon Feb  2 22:07:45 2015
New Revision: 1656591

URL: http://svn.apache.org/r1656591
Log:
OPENNLP-756
Many small changes across a few classes to support regular expressions in the country context file, which can now contain regex patterns. A bug in the AdminBoundaryContextGenerator was also fixed, which improved the performance of the ProvinceProximityScorer.

Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java?rev=1656591&r1=1656590&r2=1656591&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java Mon Feb  2 22:07:45 2015
@@ -74,14 +74,14 @@ public class AdminBoundaryContextGenerat
       GeoEntityLinker linker = new GeoEntityLinker();
       linker.init(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));
 
-      countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a shithole. Eastern Africa people are cool.");
-
+      AdminBoundaryContext c = countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a nice place. Eastern Africa people are cool.");
+      System.out.println(c);
     } catch (Exception ex) {
       java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex);
     }
   }
 
-  public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException{
+  public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws IOException {
     this.properties = properties;
     if (countrydata == null) {
       String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
@@ -155,12 +155,12 @@ public class AdminBoundaryContextGenerat
         for (String cc : countryhitMap.keySet()) {
           Map<String, String> provsForCc = provMap.get(cc);
           if (provsForCc != null) {
-            provMentions = regexfind(text, provsForCc, provHits);
+            provMentions.putAll(regexfind(text, provsForCc, provHits));
             if (provMentions != null) {
               for (String prov : provMentions.keySet()) {
                 Map<String, String> get = countyMap.get(prov);
                 if (get != null) {
-                  countyMentions = regexfind(text, get, countyHits);
+                  countyMentions.putAll(regexfind(text, get, countyHits));
                 }
               }
             }
@@ -208,64 +208,6 @@ public class AdminBoundaryContextGenerat
     return null;
   }
 
-  /**
-   * Finds mentions of countries to assist in toponym resolution. Countries are
-   * discovered via regex based on a configured file called
-   * opennlp.geoentitylinker.countrycontext.txt. the file is configured using
-   * the entitylinker.properties file as such:
-   * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt
-   *
-   * Finding mentions in documents is very helpful for scoring. Lazily loads the
-   * list from the file.
-   *
-   * @param docText the full text of the document
-   * @return
-   */
-  @Deprecated
-  public Map<String, Set<Integer>> regexfind(String docText) {
-    countryMentions = new HashMap<>();
-    nameCodesMap.clear();
-    try {
-
-      for (CountryContextEntry entry : countrydata) {
-        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
-        Matcher rs = regex.matcher(docText);
-        String code = entry.getCc1().toLowerCase();
-
-        boolean found = false;
-        while (rs.find()) {
-          found = true;
-          Integer start = rs.start();
-          String hit = rs.group().toLowerCase();
-          if (countryMentions.containsKey(code)) {
-            countryMentions.get(code).add(start);
-          } else {
-            Set<Integer> newset = new HashSet<Integer>();
-            newset.add(start);
-            countryMentions.put(code, newset);
-          }
-          if (!hit.equals("")) {
-            if (this.nameCodesMap.containsKey(hit)) {
-              nameCodesMap.get(hit).add(code);
-            } else {
-              HashSet<String> newset = new HashSet<String>();
-              newset.add(code);
-              nameCodesMap.put(hit, newset);
-            }
-          }
-        }
-        if (found) {
-          countryHits.add(entry);
-        }
-
-      }
-
-    } catch (Exception ex) {
-      LOGGER.error(ex);
-    }
-
-    return countryMentions;
-  }
 
   /**
    * discovers indicators of admin boundary data using regex.
@@ -292,7 +234,7 @@ public class AdminBoundaryContextGenerat
         Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
         Matcher rs = regex.matcher(docText);
         String code = entry.toLowerCase();
-
+        code = code.trim().replace("", "");
         boolean found = false;
         while (rs.find()) {
           found = true;
@@ -349,7 +291,7 @@ public class AdminBoundaryContextGenerat
         }
         if (values.length == 6) {
           AdminBoundary entry = new AdminBoundary(
-                  values[0].toLowerCase().trim(),
+                  values[0].toLowerCase().trim().replace("", ""),
                   values[3].toLowerCase().trim(),
                   values[1].toLowerCase().trim(),
                   values[4].toLowerCase().trim(),
@@ -358,7 +300,7 @@ public class AdminBoundaryContextGenerat
           entries.add(entry);
         } else {
           AdminBoundary entry = new AdminBoundary(
-                  values[0].toLowerCase().trim(),
+                values[0].toLowerCase().trim().replace("", ""),
                   values[3].toLowerCase().trim(),
                   values[1].toLowerCase().trim(),
                   values[4].toLowerCase().trim(),

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1656591&r1=1656590&r2=1656591&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Mon Feb  2 22:07:45 2015
@@ -76,7 +76,7 @@ public class GazetteerSearcher {
     }
   }
 
-  public GazetteerSearcher(EntityLinkerProperties properties) throws Exception {
+  public GazetteerSearcher(EntityLinkerProperties properties) throws IOException {
     this.properties = properties;
     init();
   }
@@ -206,7 +206,7 @@ public class GazetteerSearcher {
 
   }
 
-  private void init() throws Exception {
+  private void init() throws IOException {
 
     if (opennlpIndex == null) {
       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1656591&r1=1656590&r2=1656591&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Mon Feb  2 22:07:45 2015
@@ -186,7 +186,7 @@ public class GeoEntityLinker implements
 
   @Override
   public void init(EntityLinkerProperties properties) throws IOException {
-    try {
+   
       this.linkerProperties = properties;
       countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
       gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
@@ -199,9 +199,7 @@ public class GeoEntityLinker implements
       }
       topN = rws;
       loadScorers();
-    } catch (Exception ex) {
-      throw new IOException(ex);
-    }
+    
   }
 
   @Override

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java?rev=1656591&r1=1656590&r2=1656591&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java Mon Feb  2 22:07:45 2015
@@ -44,7 +44,7 @@ public class GazetteerIndexer {
       i.index(new File("C:\\temp\\gazetteers\\geonamesdata\\allcountries\\allCountries.txt"),
               new File("C:\\temp\\gazetteers\\geonamesdata\\countryinfo.txt"),
               new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"),
-              new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"),
+              new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20141202.txt.txt"),
               new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),
               new File("C:\\temp\\gazetteers\\"),
               new File("C:\\temp\\gazetteers\\newCountryContextFile.txt"),

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java?rev=1656591&r1=1656590&r2=1656591&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java Mon Feb  2 22:07:45 2015
@@ -168,6 +168,7 @@ public class CountryProximityScorer impl
           }
         }
       }
+      
       link.getScoreMap().put("countrycontext", score);
     }
     return span;

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java?rev=1656591&r1=1656590&r2=1656591&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java Mon Feb  2 22:07:45 2015
@@ -156,9 +156,12 @@ public class ProvinceProximityScorer imp
      * span is referring to the toponym form this code key>
      */
     Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
+    if (scoreMap.isEmpty()) {
+      return span;
+    }
     for (BaseLink link : span.getLinkedEntries()) {
       //getItemParentId is the country code
-    GazetteerEntry entry = (GazetteerEntry)link;
+      GazetteerEntry entry = (GazetteerEntry) link;
       String spanCountryCode = entry.getProvinceCode();
       if (scoreMap.containsKey(spanCountryCode)) {
 
@@ -184,9 +187,9 @@ public class ProvinceProximityScorer imp
 
   /**
    * takes a map of distances from the toponym to each province mention and
-   * generates a map of scores for each province code. The map is then correlated
-   * to the code of the BaseLink parentid for retrieval. Then the score is added
-   * to the overall list.
+   * generates a map of scores for each province code. The map is then
+   * correlated to the code of the BaseLink parentid for retrieval. Then the
+   * score is added to the overall list.
    *
    * @param distanceMap
    * @param sentences