You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/01/18 21:03:54 UTC

svn commit: r1559407 - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: CountryContext.java CountryContextHit.java GazateerEntry.java GazateerSearcher.java GeoEntityLinker.java GeoEntityLinkerSetupUtils.java

Author: markg
Date: Sat Jan 18 20:03:54 2014
New Revision: 1559407

URL: http://svn.apache.org/r1559407
Log:
OPENNLP-637
OPENNLP-639
Fixed and optimized GazateerSearcher to cache properly. Added hashCode and equals to GazateerEntry and ensured no duplicates are returned.

Removed:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextHit.java
Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java Sat Jan 18 20:03:54 2014
@@ -16,6 +16,7 @@
 package opennlp.addons.geoentitylinker;
 
 import java.io.BufferedReader;
+import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -38,11 +39,19 @@ import opennlp.tools.entitylinker.Entity
 public class CountryContext {
 
   private List<CountryContextEntry> countrydata;
-  private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
-  private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
+  private Map<String, Set<String>> nameCodesMap = new HashMap<>();
+  private Map<String, Set<Integer>> countryMentions = new HashMap<>();
   private Set<CountryContextEntry> countryHits = new HashSet<>();
+  private EntityLinkerProperties properties;
 
-  public CountryContext() {
+  public CountryContext(EntityLinkerProperties properties) throws Exception {
+    this.properties = properties;
+    if (countrydata == null) {
+      String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
+
+      File countryContextFile = new File(path);
+      countrydata = getCountryContextFromFile(countryContextFile);
+    }
   }
 
   public Map<String, Set<Integer>> getCountryMentions() {
@@ -57,10 +66,12 @@ public class CountryContext {
   public Set<CountryContextEntry> getCountryHits() {
     return countryHits;
   }
-/**
- * returns the last name to codes map after calling regexFind
- * @return
- */
+
+  /**
+   * returns the last name to codes map after calling regexFind
+   *
+   * @return
+   */
   public Map<String, Set<String>> getNameCodesMap() {
     return nameCodesMap;
   }
@@ -83,15 +94,12 @@ public class CountryContext {
    * @param properties EntityLinkerProperties for getting database connection
    * @return
    */
-  public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
+  public Map<String, Set<Integer>> regexfind(String docText) {
     countryMentions = new HashMap<>();
     nameCodesMap.clear();
     try {
 
-      if (countrydata == null) {
-        countrydata = getCountryContextFromFile(properties);
-        //   countrydata = getCountryData(properties);
-      }
+
       for (CountryContextEntry entry : countrydata) {
         Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
         Matcher rs = regex.matcher(docText);
@@ -133,9 +141,9 @@ public class CountryContext {
     return countryMentions;
   }
 
-  private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
+  private List<CountryContextEntry> getCountryContextFromFile(File countryContextFile) {
     List<CountryContextEntry> entries = new ArrayList<>();
-    String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");
+    String path = countryContextFile.getPath();
     BufferedReader reader;
 
     try {

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java Sat Jan 18 20:03:54 2014
@@ -17,6 +17,7 @@ package opennlp.addons.geoentitylinker;
 
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Objects;
 import opennlp.tools.entitylinker.domain.BaseLink;
 
 /**
@@ -116,4 +117,40 @@ public class GazateerEntry extends BaseL
   public void setIndexData(Map<String, String> indexData) {
     this.indexData = indexData;
   }
+
+  @Override
+  public int hashCode() {
+    int hash = 7;
+    hash = 29 * hash + Objects.hashCode(this.latitude);
+    hash = 29 * hash + Objects.hashCode(this.longitude);
+    hash = 29 * hash + Objects.hashCode(this.source);
+    hash = 29 * hash + Objects.hashCode(this.indexID);
+    return hash;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == null) {
+      return false;
+    }
+    if (getClass() != obj.getClass()) {
+      return false;
+    }
+    final GazateerEntry other = (GazateerEntry) obj;
+    if (!Objects.equals(this.latitude, other.latitude)) {
+      return false;
+    }
+    if (!Objects.equals(this.longitude, other.longitude)) {
+      return false;
+    }
+    if (!Objects.equals(this.source, other.source)) {
+      return false;
+    }
+    if (!Objects.equals(this.indexID, other.indexID)) {
+      return false;
+    }
+    return true;
+  }
+
+
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java Sat Jan 18 20:03:54 2014
@@ -53,8 +53,11 @@ public class GazateerSearcher {
   private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
   private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
   private Analyzer usgsAnalyzer;
+  private EntityLinkerProperties properties;
 
-  public GazateerSearcher() {
+  public GazateerSearcher(EntityLinkerProperties properties) throws Exception {
+    this.properties = properties;
+    init();
   }
 
   /**
@@ -66,39 +69,26 @@ public class GazateerSearcher {
    *                     lucene indexes are
    * @return
    */
-  public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
+  public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
     ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+    String luceneQueryString = "";
     try {
       /**
        * build the search string Sometimes no country context is found. In this
        * case the code variable will be an empty string
        */
-      String luceneQueryString = !code.equals("")
+      luceneQueryString = !code.equals("")
               ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
               : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
       /**
        * check the cache and go no further if the records already exist
        */
-      ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
+      ArrayList<GazateerEntry> get = GazateerSearchCache.get(luceneQueryString);
       if (get != null) {
-      
+
         return get;
       }
-      if (geonamesIndex == null) {
-        String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
-        if(indexloc.equals("")){
-          System.out.println("Geonames Gaz location not found");
-          return linkedData;
-        }
-        String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
-        scoreCutoff = Double.valueOf(cutoff);
-        geonamesIndex = new MMapDirectory(new File(indexloc));
-        geonamesReader = DirectoryReader.open(geonamesIndex);
-        geonamesSearcher = new IndexSearcher(geonamesReader);
-        //TODO: a language code switch statement should be employed here at some point
-        geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
 
-      }
 
       QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
       Query q = parser.parse(luceneQueryString);
@@ -152,19 +142,22 @@ public class GazateerSearcher {
         }
         //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
         if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
-          linkedData.add(entry);
+          if (!linkedData.contains(entry)) {
+            linkedData.add(entry);
+          }
         }
       }
-
-      normalize(linkedData, 0d, maxScore);
-      prune(linkedData);
+      if (!linkedData.isEmpty()) {
+        normalize(linkedData, 0d, maxScore);
+        prune(linkedData);
+      }
     } catch (IOException | ParseException ex) {
       System.err.println(ex);
     }
     /**
      * add the records to the cache for this query
      */
-    GazateerSearchCache.put(searchString, linkedData);
+    GazateerSearchCache.put(luceneQueryString, linkedData);
     return linkedData;
   }
 
@@ -177,43 +170,27 @@ public class GazateerSearcher {
    * @param properties   properties file that states where the lucene indexes
    * @return
    */
-  public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned, EntityLinkerProperties properties) {
+  public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned) {
     ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+    String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
     try {
 
-      String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
+
       /**
        * hit the cache
        */
-      ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
+      ArrayList<GazateerEntry> get = GazateerSearchCache.get(luceneQueryString);
       if (get != null) {
         //if the name is already there, return the list of cavhed results
         return get;
       }
-      if (usgsIndex == null) {
-        String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
-        if(indexloc.equals("")){
-          System.out.println("USGS Gaz location not found");
-          return linkedData;
-        }
-        String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
-        scoreCutoff = Double.valueOf(cutoff);
-        usgsIndex = new MMapDirectory(new File(indexloc));
-        usgsReader = DirectoryReader.open(usgsIndex);
-        usgsSearcher = new IndexSearcher(usgsReader);
-        usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
-      }
-
-
       QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, usgsAnalyzer);
       Query q = parser.parse(luceneQueryString);
 
-
       TopDocs search = usgsSearcher.search(q, rowsReturned);
       double maxScore = (double) search.getMaxScore();
 
-
-      for (int i = 0; i < search.scoreDocs.length; ++i) {
+      for (int i = 0; i < search.scoreDocs.length; i++) {
         GazateerEntry entry = new GazateerEntry();
         int docId = search.scoreDocs[i].doc;
         double sc = search.scoreDocs[i].score;
@@ -224,8 +201,6 @@ public class GazateerSearcher {
         entry.setIndexID(docId + "");
         entry.setSource("usgs");
         entry.setItemParentID("us");
-
-
         Document d = usgsSearcher.doc(docId);
         List<IndexableField> fields = d.getFields();
         for (int idx = 0; idx < fields.size(); idx++) {
@@ -250,20 +225,21 @@ public class GazateerSearcher {
           }
           entry.getIndexData().put(fields.get(idx).name(), value);
         }
-        linkedData.add(entry);
-
-
+        if (!linkedData.contains(entry)) {
+          linkedData.add(entry);
+        }
+      }
+      if (!linkedData.isEmpty()) {
+        normalize(linkedData, 0d, maxScore);
+        prune(linkedData);
       }
-
-      normalize(linkedData, 0d, maxScore);
-      prune(linkedData);
     } catch (IOException | ParseException ex) {
       System.err.println(ex);
     }
     /**
      * add the records to the cache for this query
      */
-    GazateerSearchCache.put(searchString, linkedData);
+    GazateerSearchCache.put(luceneQueryString, linkedData);
     return linkedData;
   }
 
@@ -308,4 +284,35 @@ public class GazateerSearcher {
     d = d == null ? 0d : d;
     return d;
   }
+
+  private void init() throws Exception {
+    if (usgsIndex == null) {
+      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+      if (indexloc.equals("")) {
+        System.out.println("USGS Gaz location not found");
+
+      }
+      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+      scoreCutoff = Double.valueOf(cutoff);
+      usgsIndex = new MMapDirectory(new File(indexloc));
+      usgsReader = DirectoryReader.open(usgsIndex);
+      usgsSearcher = new IndexSearcher(usgsReader);
+      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+    }
+    if (geonamesIndex == null) {
+      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
+      if (indexloc.equals("")) {
+        System.out.println("Geonames Gaz location not found");
+
+      }
+      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+      scoreCutoff = Double.valueOf(cutoff);
+      geonamesIndex = new MMapDirectory(new File(indexloc));
+      geonamesReader = DirectoryReader.open(geonamesIndex);
+      geonamesSearcher = new IndexSearcher(geonamesReader);
+      //TODO: a language code switch statement should be employed here at some point
+      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+
+    }
+  }
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Sat Jan 18 20:03:54 2014
@@ -30,15 +30,13 @@ import opennlp.tools.entitylinker.Entity
  * scoring techniques to enable resolution. The gazateers are stored in lucene
  * indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class
  * in this same package.
- *
- *
  */
-public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
+public class GeoEntityLinker implements EntityLinker<LinkedSpan, EntityLinkerProperties> {
 
   private CountryContext countryContext;
   private Map<String, Set<Integer>> countryMentions;
   private EntityLinkerProperties linkerProperties;
-  private GazateerSearcher gazateerSearcher = new GazateerSearcher();
+  private GazateerSearcher gazateerSearcher;
   private List<LinkedEntityScorer> scorers = new ArrayList<>();
   /**
    * Flag for deciding whether to search gaz only for toponyms within countries
@@ -46,8 +44,7 @@ public class GeoEntityLinker implements 
    */
   private Boolean filterCountryContext = true;
 
-  public GeoEntityLinker() {
-    countryContext = new CountryContext();
+  public GeoEntityLinker() throws Exception {
   }
 
   @Override
@@ -57,7 +54,7 @@ public class GeoEntityLinker implements 
     if (linkerProperties == null) {
       throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
     }
-    countryMentions = countryContext.regexfind(doctext, linkerProperties);
+    countryMentions = countryContext.regexfind(doctext);
 
     for (int s = 0; s < sentences.length; s++) {
       Span[] names = namesBySentence[s];
@@ -66,28 +63,33 @@ public class GeoEntityLinker implements 
 
       for (int i = 0; i < matches.length; i++) {
 
-//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
+        /**
+         * nga gazateer is for other than US placenames,don't want to use it if
+         * US is the only country mentioned in the doc
+         *
+         */
         ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
-        if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
-          // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
+        if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)
+                || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
+        
           if (!countryMentions.keySet().isEmpty()) {
             for (String code : countryMentions.keySet()) {
               if (!code.equals("us")) {
-                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
+                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code));
               }
             }
           } else {
-            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, "", linkerProperties));
+            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, ""));
 
           }
 
         }
-        ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
+        ArrayList<BaseLink> usgsEntries = new ArrayList<>();
         if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
           //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
-          usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3, linkerProperties));
+          usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3));
         }
-        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
+        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
 
         if (!usgsEntries.isEmpty()) {
           geoSpan.getLinkedEntries().addAll(usgsEntries);
@@ -102,21 +104,34 @@ public class GeoEntityLinker implements 
       }
     }
 
+    if (!scorers.isEmpty()) {
+      for (LinkedEntityScorer scorer : scorers) {
+        scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
+      }
+    }
+
+    return spans;
+  }
+
+  private void loadScorers() {
     if (scorers.isEmpty()) {
       scorers.add(new FuzzyStringMatchScorer());
       scorers.add(new GeoHashBinningScorer());
       scorers.add(new CountryProximityScorer());
       scorers.add(new ModelBasedScorer());
     }
-    for (LinkedEntityScorer scorer : scorers) {
-      scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
-    }
-    return spans;
   }
 
   @Override
-  public void setEntityLinkerProperties(EntityLinkerProperties properties) {
-    this.linkerProperties = properties;
+  public void init(EntityLinkerProperties properties) {
+    try {
+      this.linkerProperties = properties;
+      countryContext = new CountryContext(this.linkerProperties);
+      gazateerSearcher = new GazateerSearcher(this.linkerProperties);
+      loadScorers();
+    } catch (Exception ex) {
+      throw new RuntimeException(ex);
+    }
   }
 
   @Override

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java Sat Jan 18 20:03:54 2014
@@ -85,13 +85,13 @@ public class GeoEntityLinkerSetupUtils {
    *                          opennlp.geoentitylinker.countrycontext.filepath
    * @throws IOException
    */
-  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
-    CountryContext context = new CountryContext();
+  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {
+    CountryContext context = new CountryContext(properties);
     FileWriter writer = new FileWriter(annotationOutFile, true);
     System.out.println("processing " + documents.size() + " documents");
     for (String docText : documents) {
       System.out.append(".");
-      Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
+      Map<String, Set<Integer>> regexfind = context.regexfind(docText);
       Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
       for (String key : modelCountryContext.keySet()) {
         for (String wordbag : modelCountryContext.get(key)) {