You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/01/12 15:44:55 UTC

svn commit: r1557540 - /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/

Author: markg
Date: Sun Jan 12 14:44:54 2014
New Revision: 1557540

URL: http://svn.apache.org/r1557540
Log:
OPENNLP-579
Many efficiency improvements. Fails gracefully if any resources are missing (gazetteers, country context data, etc.)
Updated javadocs and comments

Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java Sun Jan 12 14:44:54 2014
@@ -37,7 +37,6 @@ import opennlp.tools.entitylinker.Entity
  */
 public class CountryContext {
 
- 
   private List<CountryContextEntry> countrydata;
   private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
   private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
@@ -50,10 +49,18 @@ public class CountryContext {
     return countryMentions;
   }
 
+  /**
+   * returns the last set of hits after calling regexFind
+   *
+   * @return
+   */
   public Set<CountryContextEntry> getCountryHits() {
     return countryHits;
   }
-
+/**
+ * returns the last name to codes map after calling regexFind
+ * @return
+ */
   public Map<String, Set<String>> getNameCodesMap() {
     return nameCodesMap;
   }
@@ -63,10 +70,14 @@ public class CountryContext {
   }
 
   /**
-   * Finds mentions of countries based on a list from MySQL stored procedure
-   * called getCountryList. This method finds country mentions in documents,
-   * which is an essential element of the scoring that is done for geo
-   * linkedspans. Lazily loads the list from the database.
+   * Finds mentions of countries to assist in toponym resolution. Countries are
+   * discovered via regex based on a configured file called
+   * opennlp.geoentitylinker.countrycontext.txt. The file is configured using
+   * the entitylinker.properties file as such:
+   * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt
+   *
+   * Finding mentions in documents is very helpful for scoring. Lazily loads the
+   * list from the file.
    *
    * @param docText    the full text of the document
    * @param properties EntityLinkerProperties for getting database connection

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java Sun Jan 12 14:44:54 2014
@@ -18,7 +18,7 @@ package opennlp.addons.geoentitylinker;
 import java.util.Objects;
 
 /**
- *Stores a tuple from mysql that is used to find country mentions in document text.
+ *Stores a tuple from the opennlp.geoentitylinker.countrycontext.txt file, which is used to find country mentions in document text.
  *
  */
 public class CountryContextEntry {

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java Sun Jan 12 14:44:54 2014
@@ -28,7 +28,13 @@ import opennlp.tools.entitylinker.domain
 import opennlp.tools.util.Span;
 
 /**
- * Scores toponyms based on country context as well as fuzzy string matching
+ * Scores toponyms based on their proximity to a country mention. Based on the
+ * heuristic that toponym mentions are more likely close to their parent
+ * country mentions. For instance, if the toponym Berlin is mentioned near an
+ * indicator of Germany, it is more likely to be Berlin Germany than Berlin
+ * Connecticut.
+ *
+ *
  */
 public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
 
@@ -45,8 +51,7 @@ public class CountryProximityScorer impl
   /**
    * Assigns a score to each BaseLink in each linkedSpan's set of N best
    * matches. Currently the scoring indicates the probability that the toponym
-   * is correct based on the country context in the document and fuzzy string
-   * matching
+   * is correct based on the country context in the document
    *
    * @param linkedData     the linked spans, holds the Namefinder results, and
    *                       the list of BaseLink for each
@@ -101,9 +106,11 @@ public class CountryProximityScorer impl
    */
   private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
     Double score = 0.0;
-    //get the index of the actual span, begining of sentence
-    //should generate tokens from sentence and create a char offset...
-    //could have large sentences due to poor sentence detection or wonky doc text
+    /*
+     * get the index of the actual span, beginning of sentence. Should generate
+     * tokens from sentence and create a char offset... Could have large
+     * sentences due to poor sentence detection or wonky doc text
+     */
     int sentenceIdx = span.getSentenceid();
     int sentIndexInDoc = sentences[sentenceIdx].getStart();
     /**
@@ -151,7 +158,7 @@ public class CountryProximityScorer impl
           //if so, is it the correct country code for that name?
           if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
             //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
-            //TODO: make this multiplier configurable
+            //TODO: make this smarter, and utilize province/state info in the future to be even more specific
             score = (score + .75) > 1.0 ? 1d : (score + .75);
 
             if (link.getItemParentID().equals(dominantCode)) {
@@ -166,10 +173,10 @@ public class CountryProximityScorer impl
   }
 
   /**
-   * takes a map of distances from the NE to each country mention and generates
-   * a map of scores for each country code. The map is then correlated to teh
-   * correlated to the code of the BaseLink parentid for retrieval. Then the
-   * score is added to the overall.
+   * takes a map of distances from the toponym to each country mention and generates
+   * a map of scores for each country code. The map is then correlated to the
+   * code of the BaseLink parentid for retrieval. Then the
+   * score is added to the overall list.
    *
    * @param distanceMap
    * @param sentences
@@ -216,7 +223,7 @@ public class CountryProximityScorer impl
    * together to smooth out the average, so one distant outlier does not kill
    * the score for an obviously good hit. More elegant solution is possible
    * using Math.pow, and making the score decay with distance by using an
-   * increasing negative exponent
+   * increasing negative exponent (I think)
    *
    * @param normDis the normalized and sorted set of distances as a list
    * @return

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java Sun Jan 12 14:44:54 2014
@@ -26,7 +26,7 @@ import opennlp.tools.util.Span;
 
 /**
  *
- * Generates scores for string comparisons.
+ * Generates scores based on string comparisons (Levenshtein and Dice)
  */
 public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
 

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java Sun Jan 12 14:44:54 2014
@@ -21,7 +21,7 @@ import opennlp.tools.entitylinker.domain
 
 /**
  *
- * Stores a record from a geographic placenames gazateer
+ * Stores a minimal amount of information from a geographic placenames gazateer
  */
 public class GazateerEntry extends BaseLink {
 
@@ -29,46 +29,91 @@ public class GazateerEntry extends BaseL
   private Double longitude;
   private String source;
   private String indexID;
-  private Map<String, String> indexData=new HashMap<>();
+  private Map<String, String> indexData = new HashMap<>();
 
+  /**
+   * returns the id from the lucene document
+   *
+   * @return
+   */
   public String getIndexID() {
     return indexID;
   }
+  /**
+   * sets the id from the lucene document
+   */
 
   public void setIndexID(String indexID) {
     this.indexID = indexID;
   }
 
+  /**
+   * returns the latitude from the gazateer
+   *
+   * @return
+   */
   public Double getLatitude() {
     return latitude;
   }
 
+  /**
+   * sets the latitude from the gazateer
+   *
+   */
   public void setLatitude(Double latitude) {
     this.latitude = latitude;
   }
 
+  /**
+   * returns the longitude from the gaz
+   *
+   * @return
+   */
   public Double getLongitude() {
     return longitude;
   }
 
+  /**
+   * sets the longitude from the gaz
+   *
+   * @param longitude
+   */
   public void setLongitude(Double longitude) {
     this.longitude = longitude;
   }
 
+  /**
+   * returns the source of the gazateer data
+   *
+   * @return
+   */
   public String getSource() {
     return source;
   }
 
+  /**
+   * sets the source (the source of the gazateer data)
+   *
+   * @param source
+   */
   public void setSource(String source) {
     this.source = source;
   }
 
+  /**
+   * Returns all the other fields in the gazateer in the form of a map
+   *
+   * @return
+   */
   public Map<String, String> getIndexData() {
     return indexData;
   }
 
+  /**
+   * sets the other fields in the gazateer in the form of a map
+   *
+   */
   public void setIndexData(Map<String, String> indexData) {
     this.indexData = indexData;
   }
-  
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java Sun Jan 12 14:44:54 2014
@@ -79,7 +79,13 @@ public class GazateerIndexer {
       }
     }
   }
-
+/**
+ * indexes the USGS or Geonames gazateers.
+ * @param outputIndexDir a DIRECTORY path where you would like to store the output lucene indexes
+ * @param gazateerInputData the file, "as is" that was downloaded from the USGS and GEONAMES website
+ * @param type indicates whether the data is USGS or GEONAMES format
+ * @throws Exception
+ */
   public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
     if (!outputIndexDir.isDirectory()) {
       throw new IllegalArgumentException("outputIndexDir must be a directory.");

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java Sun Jan 12 14:44:54 2014
@@ -21,13 +21,17 @@ import java.util.Map;
 
 /**
  *
- * Caches gazateer query results statically
+ * Caches gazateer query results statically. Clears itself if more than 10000 results are cached.
  */
 public class GazateerSearchCache {
 
   private static Map<String, ArrayList<GazateerEntry>> gazCache = new HashMap<>();
 
-
+/**
+ * returns the cached entries. Returns null if the query does not exist in the cache
+ * @param searchString
+ * @return
+ */
   public static synchronized ArrayList<GazateerEntry> get(String searchString) {
     return gazCache.get(searchString);
   }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java Sun Jan 12 14:44:54 2014
@@ -39,7 +39,7 @@ import opennlp.tools.entitylinker.Entity
 
 /**
  *
- * Searches Gazateers stored in a MMapDirectory lucene index
+ * Searches Gazateers stored in a MMapDirectory Lucene index
  */
 public class GazateerSearcher {
 
@@ -59,18 +59,19 @@ public class GazateerSearcher {
 
   /**
    *
-   * @param searchString the nameed entity to look up in the lucene index
+   * @param searchString the named entity to look up in the lucene index
    * @param rowsReturned how many rows to allow lucene to return
    * @param code         the country code
-   * @param properties   properties file that states where the lucene indexes
-   *                     are
+   * @param properties   the entitylinker.properties file that states where the
+   *                     lucene indexes are
    * @return
    */
   public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
     ArrayList<GazateerEntry> linkedData = new ArrayList<>();
     try {
       /**
-       * build the search string
+       * build the search string. Sometimes no country context is found. In this
+       * case the code variable will be an empty string
        */
       String luceneQueryString = !code.equals("")
               ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
@@ -80,25 +81,28 @@ public class GazateerSearcher {
        */
       ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
       if (get != null) {
+      
         return get;
       }
       if (geonamesIndex == null) {
         String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
-        String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".60");
+        if(indexloc.equals("")){
+          System.out.println("Geonames Gaz location not found");
+          return linkedData;
+        }
+        String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
         scoreCutoff = Double.valueOf(cutoff);
         geonamesIndex = new MMapDirectory(new File(indexloc));
         geonamesReader = DirectoryReader.open(geonamesIndex);
         geonamesSearcher = new IndexSearcher(geonamesReader);
+        //TODO: a language code switch statement should be employed here at some point
         geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
 
       }
 
-
-
       QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
       Query q = parser.parse(luceneQueryString);
 
-
       TopDocs search = geonamesSearcher.search(q, rowsReturned);
       double maxScore = (double) search.getMaxScore();
 
@@ -118,6 +122,12 @@ public class GazateerSearcher {
         for (int idx = 0; idx < fields.size(); idx++) {
           String value = d.get(fields.get(idx).name());
           value = value.toLowerCase();
+          /**
+           * these positions map to the required fields in the gaz TODO: allow a
+           * configurable list of columns that map to the GazateerEntry fields,
+           * then users would be able to plug in any gazateer they have (if they
+           * build a lucene index out of it)
+           */
           switch (idx) {
             case 1:
               entry.setItemID(value);
@@ -140,7 +150,7 @@ public class GazateerSearcher {
           }
           entry.getIndexData().put(fields.get(idx).name(), value);
         }
-        //only keep it if the country code is a match
+        //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
         if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
           linkedData.add(entry);
         }
@@ -182,6 +192,10 @@ public class GazateerSearcher {
       }
       if (usgsIndex == null) {
         String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+        if(indexloc.equals("")){
+          System.out.println("USGS Gaz location not found");
+          return linkedData;
+        }
         String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
         scoreCutoff = Double.valueOf(cutoff);
         usgsIndex = new MMapDirectory(new File(indexloc));
@@ -264,15 +278,31 @@ public class GazateerSearcher {
     }
   }
 
+  /**
+   * gets rid of entries that are below the score thresh
+   *
+   * @param linkedData
+   */
   private void prune(ArrayList<GazateerEntry> linkedData) {
     for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {
       GazateerEntry ge = itr.next();
+      /**
+       * throw away anything under the configured score thresh
+       */
       if (ge.getScoreMap().get("lucene") < scoreCutoff) {
         itr.remove();
       }
     }
   }
 
+  /**
+   * normalizes the different levenstein scores returned from the query into a
+   * range from 0 to 1
+   * @param valueToNormalize the raw score
+   * @param minimum          the min of the range of scores
+   * @param maximum          the max of the range
+   * @return the normed score
+   */
   private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
     Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
     d = d == null ? 0d : d;

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Sun Jan 12 14:44:54 2014
@@ -26,8 +26,10 @@ import opennlp.tools.entitylinker.Entity
 import opennlp.tools.entitylinker.EntityLinker;
 
 /**
- * Links location entities to gazatteers. Currently supports gazateers in a
- * MySql database (NGA and USGS)
+ * Links location entities to the USGS and GeoNames gazatteers, and uses several
+ * scoring techniques to enable resolution. The gazateers are stored in lucene
+ * indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class
+ * in this same package.
  *
  *
  */

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java Sun Jan 12 14:44:54 2014
@@ -37,29 +37,45 @@ import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import static opennlp.addons.geoentitylinker.ModelBasedScorer.RADIUS;
 
-
 /**
  *
  * Tools for setting up GeoEntityLinker gazateers and doccat scoring model
  */
 public class GeoEntityLinkerSetupUtils {
+
   public static ModelBasedScorer scorer;
 
   static {
     scorer = new ModelBasedScorer();
   }
-    public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type){
-      GazateerIndexer indexer = new GazateerIndexer();
-      try {
-        indexer.index(outputIndexDir, gazateerInputData, type);
-      } catch (Exception ex) {
-       ex.printStackTrace();
-      }
+
+  /**
+   * Generates the lucene indexes of the USGS and GEONAMES gazateers.
+   *
+   * @param outputIndexDir    the destination directory of the index. Must be a
+   *                          directory
+   * @param gazateerInputData the input data file. Must be in geonames gaz
+   *                          format, or USGS format
+   * @param type              the type, USGS, or GEONAMES
+   */
+  public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type) {
+    GazateerIndexer indexer = new GazateerIndexer();
+    try {
+      indexer.index(outputIndexDir, gazateerInputData, type);
+    } catch (Exception ex) {
+      ex.printStackTrace();
     }
-    /**
+  }
+
+  /**
+   * Generates a doccat model from proximal features generated from surrounding
+   * context of country mentions. This model is used as a basis for a score
+   * called countrymodel, which takes the context from around a toponym, and uses
+   * this model to return a score for the country code of the toponym hit in the
+   * gazateer.
    *
    * @param documents         A list of document texts, for best results try to
-   *                          ensure each country you care about will be
+   *                          ensure each country you care about will be well
    *                          represented in the collection
    * @param annotationOutFile the location where the annotated doccat text file
    *                          will be stored
@@ -83,7 +99,7 @@ public class GeoEntityLinkerSetupUtils {
         }
       }
     }
-    System.out.println("Document processing complete. Writing training data to "+ annotationOutFile.getAbsolutePath());
+    System.out.println("Document processing complete. Writing training data to " + annotationOutFile.getAbsolutePath());
     writer.close();
     System.out.println("Building Doccat model...");
     DoccatModel model = null;
@@ -98,7 +114,7 @@ public class GeoEntityLinkerSetupUtils {
       model = DocumentCategorizerME.train("en", sampleStream);
       OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
       model.serialize(modelOut);
-       System.out.println("Model complete!");
+      System.out.println("Model complete!");
     } catch (IOException e) {
       // Failed to read or parse training data, training failed
       e.printStackTrace();
@@ -142,5 +158,4 @@ public class GeoEntityLinkerSetupUtils {
     }
     return featureBags;
   }
-
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java Sun Jan 12 14:44:54 2014
@@ -28,18 +28,19 @@ import opennlp.tools.entitylinker.domain
 import opennlp.tools.util.Span;
 
 /**
- *Scores toponymns based on geographic point binning (clustering). This classes output is highly dependant on the quality
- * of points returned from the gazateer. False positive hits from the index will pollute this result. Ensure the score cutoff for the
- * Lucene search is set to an appropriate level so this class if not fed poor data.
+ * Scores toponyms based on geographic point binning. Based on the heuristic
+ * that docs are generally about a small number of locations, so one can detect
+ * outliers by finding those points that are not near the majority.
+ *
  */
 public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
-     score( linkedSpans);
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+    score(linkedSpans);
   }
 
-  private  void score(List<LinkedSpan> geospans) {
+  private void score(List<LinkedSpan> geospans) {
     Map<Double, Double> latLongs = new HashMap<Double, Double>();
 
     /**
@@ -50,7 +51,7 @@ public class GeoHashBinningScorer implem
         if (bl instanceof GazateerEntry) {
           GazateerEntry entry = (GazateerEntry) bl;
           latLongs.put(entry.getLatitude(), entry.getLongitude());
-        
+
         }
       }
     }
@@ -77,7 +78,7 @@ public class GeoHashBinningScorer implem
         if (bl instanceof GazateerEntry) {
           GazateerEntry entry = (GazateerEntry) bl;
           geohash = geoHash(entry.getLatitude(), entry.getLongitude());
-        
+
         }
         if (scores.containsKey(geohash)) {
           score = scores.get(geohash);
@@ -158,9 +159,9 @@ public class GeoHashBinningScorer implem
     for (Long l : diffs) {
       sum += l;
     }
-    Long avg=sum;
-    if(!diffs.isEmpty()){
-     avg = sum / diffs.size();
+    Long avg = sum;
+    if (!diffs.isEmpty()) {
+      avg = sum / diffs.size();
     }
 
 
@@ -273,4 +274,3 @@ public class GeoHashBinningScorer implem
     return d;
   }
 }
-

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java Sun Jan 12 14:44:54 2014
@@ -28,6 +28,7 @@ public interface LinkedEntityScorer<T> {
 
 /**
  * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan
+ * This method modifies, in place, the linkedSpans that were passed in.
  * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
  * @param docText the full text of the document.
  * @param sentenceSpans the sentence spans the correspond to the document text

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java Sun Jan 12 14:44:54 2014
@@ -37,6 +37,7 @@ public class ModelBasedScorer implements
   DocumentCategorizerME documentCategorizerME;
   DoccatModel doccatModel;
   public static final int RADIUS = 100;
+  boolean modelexists = false;
 
   @Override
   public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
@@ -44,8 +45,10 @@ public class ModelBasedScorer implements
       if (doccatModel == null) {
         String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
         if (path.equals("")) {
-          System.err.println(this.getClass().getSimpleName() + ": could not find property \"opennlp.geoentitylinker.modelbasedscorer.modelpath\" : no ModelBasedScoring will be performed");
-
+          if (!modelexists) {
+            System.err.println(this.getClass().getSimpleName() + ": could not find property \"opennlp.geoentitylinker.modelbasedscorer.modelpath\" : no ModelBasedScoring will be performed");
+          }
+          modelexists = true;
           return;
         }
         doccatModel = new DoccatModel(new File(path));