You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/08/24 02:51:40 UTC

svn commit: r1620096 - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: GazetteerSearcher.java GeoEntityLinker.java scoring/PlacetypeScorer.java

Author: markg
Date: Sun Aug 24 00:51:39 2014
New Revision: 1620096

URL: http://svn.apache.org/r1620096
Log:
OPENNLP-706
Added score normalization for all gazetteerEntries across all where clauses for each name; this normalized score is now part of the sort. Also improved the PlacetypeScorer to include the two main USGS gazetteer types, Populated Place and CIVIL. Seems to be performing better on test data.

Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1620096&r1=1620095&r2=1620096&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Sun Aug 24 00:51:39 2014
@@ -57,7 +57,6 @@ public class GazetteerSearcher {
   private boolean doubleQuoteAllSearchTerms = false;
   private boolean useHierarchyField = false;
 
-
   private EntityLinkerProperties properties;
 
   private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));
@@ -124,11 +123,14 @@ public class GazetteerSearcher {
       //Filter filter = new QueryWrapperFilter(new QueryParser(Version.LUCENE_48, whereClause, opennlpAnalyzer).parse(whereClause));      
 
       TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned);
-
+      Double maxscore = 0d;
       for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {
         GazetteerEntry entry = new GazetteerEntry();
         int docId = bestDocs.scoreDocs[i].doc;
         double sc = bestDocs.scoreDocs[i].score;
+        if (maxscore.compareTo(sc) < 0) {
+          maxscore = sc;
+        }
         entry.getScoreMap().put("lucene", sc);
         entry.setIndexID(docId + "");
 
@@ -158,19 +160,13 @@ public class GazetteerSearcher {
         for (int idx = 0; idx < fields.size(); idx++) {
           entry.getIndexData().put(fields.get(idx).name(), d.get(fields.get(idx).name()));
         }
-        /**
-         * norm the levenstein distance
-         */
-        int maxLen = searchString.length() > entry.getItemName().length() ? searchString.length() : entry.getItemName().length();
-
-        Double normLev = Math.abs(1 - (sc / (double) maxLen));
+       
         /**
          * only want hits above the levenstein thresh. This should be a low
          * thresh due to the use of the hierarchy field in the index
          */
         // if (normLev > scoreCutoff) {
         if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {
-          entry.getScoreMap().put("normlucene", normLev);
           //make sure we don't produce a duplicate
           if (!linkedData.contains(entry)) {
             linkedData.add(entry);
@@ -182,16 +178,16 @@ public class GazetteerSearcher {
         }
         //}
       }
-
+      
     } catch (IOException | ParseException ex) {
       LOGGER.error(ex);
     }
 
-  
-
     return linkedData;
   }
 
+ 
+
   /**
    * Replaces any noise chars with a space, and depending on configuration adds
    * double quotes to the string
@@ -232,11 +228,9 @@ public class GazetteerSearcher {
       analyMap.put("loctype", new KeywordAnalyzer());
       analyMap.put("countycode", new KeywordAnalyzer());
       analyMap.put("gazsource", new KeywordAnalyzer());
-      
-      
-    opennlpAnalyzer
-            = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);
 
+      opennlpAnalyzer
+              = new PerFieldAnalyzerWrapper(opennlpAnalyzer, analyMap);
 
       String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
       String usehierarchy = properties.getProperty("opennlp.geoentitylinker.gaz.hierarchyfield", String.valueOf("0"));

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1620096&r1=1620095&r2=1620096&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Sun Aug 24 00:51:39 2014
@@ -75,6 +75,24 @@ public class GeoEntityLinker implements 
         if (geoNamesEntries.isEmpty()) {
           continue;
         }
+        /**
+         * Normalize the returned scores for this name... this will assist the sort
+         */
+        if (!spans.isEmpty()) {
+
+          Double maxscore = 0d;
+          for (BaseLink gazetteerEntry : geoNamesEntries) {
+            Double deNormScore = gazetteerEntry.getScoreMap().get("lucene");
+            if (deNormScore.compareTo(maxscore) > 0) {
+              maxscore = deNormScore;
+            }
+          }
+          for (BaseLink gazetteerEntry : geoNamesEntries) {
+            Double deNormScore = gazetteerEntry.getScoreMap().get("lucene");
+            Double normalize = normalize(deNormScore, 0d, maxscore);
+            gazetteerEntry.getScoreMap().put("normlucene", normalize);
+          }
+        }
         LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);
         newspan.setSearchTerm(matches[i]);
         newspan.setLinkedEntries(geoNamesEntries);
@@ -109,7 +127,8 @@ public class GeoEntityLinker implements 
             if (object.equals("typescore")
                     || object.equals("countrycontext")
                     || object.equals("placenamedicecoef")
-                    || object.equals("geohashbin")) {
+                    || object.equals("geohashbin")
+                    || object.equals("normlucene")) {
               sumo1 += o1scoreMap.get(object);
               sumo2 += o2scoreMap.get(object);
             }
@@ -124,6 +143,21 @@ public class GeoEntityLinker implements 
     return spans;
   }
 
+  /**
+   * Linearly rescales a value so that minimum maps to 0 and maximum maps to 1.
+   * Used to normalize scores in this class.
+   *
+   * @param valueToNormalize the value to place within the new range
+   * @param minimum the min of the set to be transposed
+   * @param maximum the max of the set to be transposed
+   * @return the rescaled value, or 0 if the division yields NaN (e.g. 0/0)
+   */
+  private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
+    double d = (valueToNormalize - minimum) / (maximum - minimum);
+    // NaN is never == to anything (itself included), so it must be tested with isNaN
+    return Double.isNaN(d) ? 0d : d;
+  }
+
   private void loadScorers() {
     if (scorers.isEmpty()) {
       scorers.add(new ProvinceProximityScorer());

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java?rev=1620096&r1=1620095&r2=1620096&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PlacetypeScorer.java Sun Aug 24 00:51:39 2014
@@ -31,7 +31,7 @@ import opennlp.tools.util.Span;
  */
 public class PlacetypeScorer implements LinkedEntityScorer<AdminBoundaryContext> {
 
-  private static final String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT".split(" ");
+  private static final String[] boosts = "ADM1 ADM1H ADM2 ADM2H ADM3 ADM3H ADM4 ADM4H ADM5 ADMD ADMDH PCLD PCLH PCLI PCLIX TERR PCLIX PPL PPLA PPLA2 PPLA3 PPLA4 PPLC PPLCH PPLF PPLG PPLH PPLL PPLQ PPLR PPLS PPLX STLMT civil Populated_Place".split(" ");
   private Map<String, Double> boosetedTypes = new HashMap<>();
 
   public PlacetypeScorer() {
@@ -63,10 +63,15 @@ public class PlacetypeScorer implements 
       for (String type : boosts) {
         if (type.equals("PCLI")) {
           boosetedTypes.put(type.toLowerCase(), 1d);
-        } else if (type.startsWith("P") && !type.equals("PCLI")) {
+        } else if ((type.startsWith("PC")|| type.startsWith("PP")) && !type.equals("PCLI")) {
           boosetedTypes.put(type.toLowerCase(), .5d);
         } else if (type.startsWith("ADM")) {
           boosetedTypes.put(type.toLowerCase(), .75d);
+        }else if (type.toLowerCase().equals("civil")){
+          boosetedTypes.put(type.toLowerCase(), .9d);
+        }
+        if(type.toLowerCase().equals("populated_place")){
+           boosetedTypes.put("Populated Place", .75d);
         }
 
       }