You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/08/15 21:59:04 UTC

svn commit: r1618267 - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: GazetteerSearcher.java GeoEntityLinker.java indexing/USGSProcessor.java scoring/FuzzyStringMatchScorer.java

Author: markg
Date: Fri Aug 15 19:59:04 2014
New Revision: 1618267

URL: http://svn.apache.org/r1618267
Log:
OPENNLP-706
Significant fix to the USGS indexing so that state names are properly discovered and weighted. Added the placename Dice coefficient (over bigrams) to the descending sort.

Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1618267&r1=1618266&r2=1618267&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Fri Aug 15 19:59:04 2014
@@ -50,7 +50,6 @@ import org.apache.lucene.analysis.util.C
  */
 public class GazetteerSearcher {
 
-  //private static final String boostedTerms = " AND loctype(ADM1^1 ADM1H^1 ADM2^1 ADM2H^1 ADM3^1 ADM3H^1 ADM4^1 ADM4H^1 ADM5^1 ADMD^1 ADMDH^1 PCLD^1 PCLH^1 PCLI^1 PCLIX^1 TERR^1 PCLIX^1 PPL^1 PPLA^1 PPLA2^1 PPLA3^1 PPLA4^1 PPLC^1 PPLCH^1 PPLF^1 PPLG^1 PPLH^1 PPLL^1 PPLQ^1 PPLR^1 PPLS^1 PPLX^1 STLMT^1) ";
   private final String REGEX_CLEAN = "[^\\p{L}\\p{Nd}]";
   private static final Logger LOGGER = Logger.getLogger(GazetteerSearcher.class);
   private double scoreCutoff = .70;
@@ -74,7 +73,7 @@ public class GazetteerSearcher {
     try {
       boolean b = Boolean.valueOf("true");
 
-      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("italy", 5, " countrycode:it AND gazsource:geonames");
+      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).find("alabama", 5, " countrycode:us AND gazsource:usgs");
     } catch (IOException ex) {
       java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
     } catch (Exception ex) {

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1618267&r1=1618266&r2=1618267&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Fri Aug 15 19:59:04 2014
@@ -106,7 +106,7 @@ public class GeoEntityLinker implements 
           for (String object : o1scoreMap.keySet()) {
             if (object.equals("typescore")
                     || object.equals("countrycontext")
-                    || object.equals("normlucene")
+                    || object.equals("placenamedicecoef")
                     || object.equals("geohashbin")) {
               sumo1 += o1scoreMap.get(object);
               sumo2 += o2scoreMap.get(object);

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java?rev=1618267&r1=1618266&r2=1618267&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java Fri Aug 15 19:59:04 2014
@@ -57,6 +57,7 @@ public class USGSProcessor {
 
   public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {
 
+    Map<String, StateCentroid> states = new HashMap<>();
     BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
     List<String> fields = new ArrayList<>();
     int counter = 0;
@@ -93,7 +94,21 @@ public class USGSProcessor {
           countyCode = get.getCountyCode();
         }
         String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;
-       // doc.add(new TextField("countryname", "united states", Field.Store.YES));
+
+        if (states.containsKey(get.getProvinceName())) {
+          StateCentroid entry = states.get(get.getProvinceName());
+          entry.count++;
+          entry.latSum += Double.valueOf(lat);
+          entry.longSum += Double.valueOf(lon);
+        } else {
+          StateCentroid centroid = new StateCentroid();
+          centroid.statecode = get.getProvCode();
+          centroid.count = 1;
+          centroid.latSum = Double.valueOf(lat);
+          centroid.longSum = Double.valueOf(lon);
+          states.put(get.getProvinceName(), centroid);
+        }
+
         doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
         doc.add(new TextField("placename", placeName, Field.Store.YES));
         doc.add(new TextField("latitude", lat, Field.Store.YES));
@@ -114,10 +129,56 @@ public class USGSProcessor {
       }
 
     }
+   
+  
+    for (String state : states.keySet()) {
+      StateCentroid get = states.get(state);
+      Document doc = new Document();
+      doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));
+      doc.add(new TextField("placename", state, Field.Store.YES));
+      //calculate a centroid for all the points that were in the state
+      doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES));
+      doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES));
+      doc.add(new StringField("loctype", "adm1", Field.Store.YES));
+      doc.add(new StringField("admincode", get.statecode, Field.Store.YES));
+      doc.add(new StringField("countrycode", "us", Field.Store.YES));
+      doc.add(new StringField("countycode", "", Field.Store.YES));
+
+      doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
+      doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+      w.addDocument(doc);
+      
+     // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));
+    }
+    Document doc = new Document();
+    doc.add(new TextField("hierarchy", "united states", Field.Store.YES));
+    doc.add(new TextField("placename", "united states", Field.Store.YES));
+    //country-level entry: uses a fixed, hard-coded centroid (39.0, -103.0) rather than one calculated from the indexed points
+    doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));
+    doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));
+    doc.add(new StringField("loctype", "pcli", Field.Store.YES));
+    doc.add(new StringField("admincode", "", Field.Store.YES));
+    doc.add(new StringField("countrycode", "us", Field.Store.YES));
+    doc.add(new StringField("countycode", "", Field.Store.YES));
+
+    doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));
+    doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+    //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts));
+
+    w.addDocument(doc);
     w.commit();
+
     System.out.println("Completed indexing USGS gaz!");
   }
 
+  private static class StateCentroid { // per-state accumulator that readFile fills in order to derive a state centroid
+
+    double latSum; // running sum of the latitudes of the places added for this state
+    double longSum; // running sum of the longitudes of the places added for this state
+    String statecode; // province code (getProvCode()) taken from the first place added for this state
+    int count; // number of places added; readFile divides latSum and longSum by this to get the centroid
+  }
+
   private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
     System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
     Map<String, AdminBoundary> outmap = new HashMap<>();

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java?rev=1618267&r1=1618266&r2=1618267&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java Fri Aug 15 19:59:04 2014
@@ -42,9 +42,15 @@ public class FuzzyStringMatchScorer impl
           if (hierarchy != null) {
             Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2);
             link.getScoreMap().put("hierarchydicecoef", dice);
-            Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase().toLowerCase());
+            Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase());
             link.getScoreMap().put("hierarchylevenshtein", ld);
           }
+          String placename = entry.getItemName().toLowerCase();
+           if (placename != null) {
+            Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename, 2);
+            link.getScoreMap().put("placenamedicecoef", dice);
+            
+          }
         }
       }
     }