You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/07/11 03:04:59 UTC

svn commit: r1609600 [1/2] - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: ./ indexing/ scoring/

Author: markg
Date: Fri Jul 11 01:04:58 2014
New Revision: 1609600

URL: http://svn.apache.org/r1609600
Log:
OPENNLP-706
OPENNLP-707
OPENNLP-708
OPENNLP-709
OPENNLP-710
Addressed each ticket. Also adjusted the package structure a bit to separate responsibility better.

Added:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
      - copied, changed from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
      - copied, changed from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
      - copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
      - copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
      - copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
      - copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
      - copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
      - copied, changed from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
      - copied, changed from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
      - copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
Removed:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java

Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.Objects;
+
+/**
+ * Immutable description of an administrative boundary: country, province and,
+ * where available, county. Only US places from the USGS Gazetteer will have
+ * county level info; when it is missing, both county fields hold the
+ * {@code "NO_DATA_FOUND"} marker.
+ *
+ * @author mgiaconia
+ */
+public class AdminBoundary {
+
+  private static final String NO_DATA_FOUND_VALUE = "NO_DATA_FOUND";
+  private final String countryCode;
+  private final String provinceCode;
+  private final String provinceName;
+  private final String countryName;
+  private final String countyName;
+  private final String countyCode;
+
+  /**
+   * Creates a boundary without county level data; both county fields are set
+   * to the no-data marker.
+   *
+   * @param cc the country code
+   * @param ac the province code
+   * @param pname the province name
+   * @param countryName the country name
+   */
+  public AdminBoundary(String cc, String ac, String pname, String countryName) {
+    this.countryCode = cc;
+    this.provinceCode = ac;
+    this.provinceName = pname;
+    this.countryName = countryName;
+    this.countyCode = NO_DATA_FOUND_VALUE;
+    this.countyName = NO_DATA_FOUND_VALUE;
+  }
+
+  /**
+   * Creates a boundary down to the county level.
+   *
+   * @param countryCode the country code
+   * @param countryName the country name
+   * @param provinceCode the province code
+   * @param provinceName the province name
+   * @param countyCode the county code; {@code null} or empty means "no data"
+   * @param countyName the county name; {@code null} or empty means "no data"
+   */
+  public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName) {
+    this.countryCode = countryCode;
+    this.provinceCode = provinceCode;
+    this.provinceName = provinceName;
+    this.countryName = countryName;
+    this.countyName = orNoData(countyName);
+    this.countyCode = orNoData(countyCode);
+  }
+
+  /**
+   * Maps a missing county value to the no-data marker. A null value is treated
+   * like an empty one so that incomplete gazetteer rows do not cause a
+   * NullPointerException during construction.
+   */
+  private static String orNoData(String value) {
+    return (value == null || value.isEmpty()) ? NO_DATA_FOUND_VALUE : value;
+  }
+
+  public String getCountryCode() {
+    return countryCode;
+  }
+
+  public String getProvCode() {
+    return provinceCode;
+  }
+
+  public String getProvinceName() {
+    return provinceName;
+  }
+
+  public String getCountryName() {
+    return countryName;
+  }
+
+  /**
+   * @return the county name, or {@code "NO_DATA_FOUND"} when none was supplied
+   */
+  public String getCountyName() {
+    return countyName;
+  }
+
+  /**
+   * @return the county code, or {@code "NO_DATA_FOUND"} when none was supplied
+   */
+  public String getCountyCode() {
+    return countyCode;
+  }
+
+  @Override
+  public String toString() {
+    return "AdminBoundary{" + "countryCode=" + countryCode + ", provinceCode=" + provinceCode + ", provinceName=" + provinceName + ", countryName=" + countryName + ", countyName=" + countyName + ", countyCode=" + countyCode + '}';
+  }
+
+  @Override
+  public int hashCode() {
+    int hash = 7;
+    hash = 11 * hash + Objects.hashCode(this.countryCode);
+    hash = 11 * hash + Objects.hashCode(this.provinceCode);
+    hash = 11 * hash + Objects.hashCode(this.provinceName);
+    hash = 11 * hash + Objects.hashCode(this.countryName);
+    hash = 11 * hash + Objects.hashCode(this.countyName);
+    hash = 11 * hash + Objects.hashCode(this.countyCode);
+    return hash;
+  }
+
+  /**
+   * Two boundaries are equal when they are of the same class and all six
+   * fields are equal.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+    final AdminBoundary other = (AdminBoundary) obj;
+    return Objects.equals(this.countryCode, other.countryCode)
+            && Objects.equals(this.provinceCode, other.provinceCode)
+            && Objects.equals(this.provinceName, other.provinceName)
+            && Objects.equals(this.countryName, other.countryName)
+            && Objects.equals(this.countyName, other.countyName)
+            && Objects.equals(this.countyCode, other.countyCode);
+  }
+
+}

Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Holds the admin boundary indicators collected for one block of text: the
+ * mention maps (code to the set of integer positions), the sets of codes that
+ * were hit, the code-to-name reference maps, a name-to-codes map, and a set of
+ * search clauses derived from the country and province hits at construction
+ * time.
+ * <p>
+ * The collections passed to the constructor are stored as-is, not copied.
+ *
+ * @author mgiaconia
+ */
+public class AdminBoundaryContext {
+
+  private final Map<String, Set<Integer>> countryMentions;
+  private final Map<String, Set<Integer>> provMentions;
+  private final Map<String, Set<Integer>> countyMentions;
+  private final Set<String> countryHits;
+  private final Set<String> provHits;
+  private final Set<String> countyHits;
+  private final Map<String, String> countryRefMap;
+  private final Map<String, Map<String, String>> provRefMap;
+  private final Map<String, Map<String, String>> countyRefMap;
+  private final Set<String> whereClauses;
+  private final Map<String, Set<String>> nameCodesMap;
+
+  /**
+   * Stores the supplied collections and derives the where clauses from
+   * {@code countryHits}, {@code provHits} and {@code provRefMap}.
+   *
+   * @param countryMentions country code to mention positions
+   * @param provMentions province code to mention positions
+   * @param countyMentions county code to mention positions
+   * @param countryHits codes of the countries that were hit
+   * @param provHits codes of the provinces that were hit
+   * @param countyHits codes of the counties that were hit
+   * @param countryRefMap country code to country name
+   * @param provRefMap country code to (province code to province name)
+   * @param countyRefMap province code to (county code to county name)
+   * @param nameCodesMap matched name to the codes it was matched for
+   */
+  public AdminBoundaryContext(Map<String, Set<Integer>> countryMentions,
+          Map<String, Set<Integer>> provMentions,
+          Map<String, Set<Integer>> countyMentions,
+          Set<String> countryHits,
+          Set<String> provHits,
+          Set<String> countyHits,
+          Map<String, String> countryRefMap,
+          Map<String, Map<String, String>> provRefMap,
+          Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>> nameCodesMap) {
+    this.countryMentions = countryMentions;
+    this.provMentions = provMentions;
+    this.countyMentions = countyMentions;
+    this.countryHits = countryHits;
+    this.provHits = provHits;
+    this.countyHits = countyHits;
+    this.countryRefMap = countryRefMap;
+    this.provRefMap = provRefMap;
+    this.countyRefMap = countyRefMap;
+    this.whereClauses = buildWhereClauses();
+    this.nameCodesMap = nameCodesMap;
+  }
+
+  public Map<String, Set<String>> getNameCodesMap() {
+    return nameCodesMap;
+  }
+
+  public Map<String, Set<Integer>> getCountryMentions() {
+    return countryMentions;
+  }
+
+  public Map<String, Set<Integer>> getProvMentions() {
+    return provMentions;
+  }
+
+  public Map<String, Set<Integer>> getCountyMentions() {
+    return countyMentions;
+  }
+
+  public Set<String> getCountryHits() {
+    return countryHits;
+  }
+
+  public Set<String> getProvHits() {
+    return provHits;
+  }
+
+  public Set<String> getCountyHits() {
+    return countyHits;
+  }
+
+  public Map<String, String> getCountryRefMap() {
+    return countryRefMap;
+  }
+
+  public Map<String, Map<String, String>> getProvRefMap() {
+    return provRefMap;
+  }
+
+  public Map<String, Map<String, String>> getCountyRefMap() {
+    return countyRefMap;
+  }
+
+  /**
+   * @return the clauses built at construction time, e.g.
+   * {@code " countrycode:us AND admincode:us.va AND gazsource:usgs"}
+   * (presumably gazetteer index query fragments; confirm against the searcher)
+   */
+  public Set<String> getWhereClauses() {
+    return whereClauses;
+  }
+
+  /**
+   * Builds one clause per (country hit, province hit) pair, or a country-only
+   * clause when none of the country's known provinces was hit. The gazetteer
+   * source suffix is "region" for codes matching {@code rg[0-9]}, "usgs" for
+   * "us" and "geonames" for everything else.
+   */
+  private Set<String> buildWhereClauses() {
+    Set<String> result = new HashSet<>();
+    for (String countryCode : this.getCountryHits()) {
+      String lowered = countryCode.toLowerCase();
+      String sourceSuffix;
+      if (lowered.matches(".*rg[0-9].*")) {
+        sourceSuffix = " AND gazsource:region";
+      } else if (lowered.equals("us")) {
+        sourceSuffix = " AND gazsource:usgs";
+      } else {
+        sourceSuffix = " AND gazsource:geonames";
+      }
+
+      Map<String, String> knownProvinces = this.getProvRefMap().get(countryCode);
+      boolean provinceMatched = false;
+      if (knownProvinces != null) {
+        for (String provinceCode : knownProvinces.keySet()) {
+          if (!this.getProvHits().contains(provinceCode)) {
+            continue;
+          }
+          provinceMatched = true;
+          result.add(" countrycode:" + countryCode + " AND admincode:" + provinceCode + sourceSuffix);
+        }
+      }
+      if (!provinceMatched) {
+        // got a country with no mentioned provs
+        result.add(" countrycode:" + countryCode + sourceSuffix);
+      }
+    }
+    return result;
+  }
+
+}

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java (from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java&r1=1594067&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java Fri Jul 11 01:04:58 2014
@@ -25,6 +25,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.logging.Level;
 
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -36,25 +37,61 @@ import org.apache.log4j.Logger;
  * Used to boost or degrade scoring of linked geo entities
  *
  */
-public class CountryContext {
+public class AdminBoundaryContextGenerator {
 
-  private static final Logger LOGGER = Logger.getLogger(CountryContext.class);
+  private static final Logger LOGGER = Logger.getLogger(AdminBoundaryContextGenerator.class);
   private List<CountryContextEntry> countrydata;
   private Map<String, Set<String>> nameCodesMap = new HashMap<>();
   private Map<String, Set<Integer>> countryMentions = new HashMap<>();
   private Set<CountryContextEntry> countryHits = new HashSet<>();
   private EntityLinkerProperties properties;
-  
-  public CountryContext(EntityLinkerProperties properties) throws Exception {
+  private List<AdminBoundary> adminBoundaryData;
+  private Set<AdminBoundary> adminBoundaryHits = new HashSet<>();
+  private AdminBoundaryContext context;
+
+  /**
+   * Builds the admin boundary context for a block of text. The previously
+   * stored context is dropped and the name-to-codes map is cleared before the
+   * text is processed; the new result is stored on this generator and returned.
+   *
+   * @param text the full text to analyse
+   * @return the context produced by {@code process(text)}, which may be
+   * {@code null}
+   */
+  public AdminBoundaryContext getContext(String text) {
+    this.context = null;
+    this.nameCodesMap.clear();
+    AdminBoundaryContext fresh = process(text);
+    this.context = fresh;
+    return fresh;
+  }
+
+  private Set<String> countryHitSet = new HashSet<>();
+  private Map<String, String> countryMap = new HashMap<>();
+  private Map<String, Map<String, String>> provMap = new HashMap<>();
+  private Map<String, Map<String, String>> countyMap = new HashMap<>();
+
+  private Map<String, Set<Integer>> provMentions = new HashMap<>();
+  private Map<String, Set<Integer>> countyMentions = new HashMap<>();
+
+  private Set<String> provHits = new HashSet<String>();
+  private Set<String> countyHits = new HashSet<String>();
+
+  /**
+   * Developer smoke test: loads the entity linker properties, initialises a
+   * {@link GeoEntityLinker}, runs the generator over a sample sentence and
+   * prints the resulting where clauses.
+   *
+   * @param args optional; {@code args[0]} is the path of the entity linker
+   * properties file. Defaults to {@code c:\temp\entitylinker.properties}.
+   */
+  public static void main(String[] args) {
+    String propertiesPath = args.length > 0 ? args[0] : "c:\\temp\\entitylinker.properties";
+    String sampleText = "This article is about Fairfax County, Virginia, and the north of Florida in the United States. "
+            + "It is also about Moscow and Atlanta, about Hillsborough County, Florida, and about Eastern Africa.";
+    try {
+      AdminBoundaryContextGenerator countryContext = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File(propertiesPath)));
+      // NOTE(review): the linker is only initialised, never used; presumably kept to
+      // check that the configuration is complete. Confirm before removing.
+      GeoEntityLinker linker = new GeoEntityLinker();
+      linker.init(new EntityLinkerProperties(new File(propertiesPath)));
+
+      AdminBoundaryContext result = countryContext.getContext(sampleText);
+      System.out.println(result == null ? "no context produced" : String.valueOf(result.getWhereClauses()));
+
+    } catch (Exception ex) {
+      java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, "admin boundary context smoke test failed", ex);
+    }
+  }
+
+  public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws Exception {
     this.properties = properties;
     if (countrydata == null) {
       String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
-      
+
       File countryContextFile = new File(path);
-      countrydata = getCountryContextFromFile(countryContextFile);
+      //countrydata = getCountryContextFromFile(countryContextFile);
+      adminBoundaryData = getContextFromFile(countryContextFile);
     }
   }
-  
+
   public Map<String, Set<Integer>> getCountryMentions() {
     return countryMentions;
   }
@@ -76,11 +113,95 @@ public class CountryContext {
   public Map<String, Set<String>> getNameCodesMap() {
     return nameCodesMap;
   }
-  
+
   public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
     this.nameCodesMap = nameCodesMap;
   }
 
+  /**
+   * Empties all per-document state (hits, mentions and the name-to-codes map)
+   * so the next document starts clean. The reference maps loaded from the
+   * context file are left untouched.
+   */
+  private void reset() {
+    // name lookup
+    nameCodesMap.clear();
+    adminBoundaryHits.clear();
+
+    // country level
+    countryHitSet.clear();
+    countryHits.clear();
+    countryMentions.clear();
+
+    // province level
+    provHits.clear();
+    provMentions.clear();
+
+    // county level
+    countyHits.clear();
+    countyMentions.clear();
+  }
+
+  /**
+   * Finds indicators of countries, provinces, and counties, as per the USGS and
+   * Geonames gazetteers. The results of this are used to score toponyms
+   * downstream. The full text of a document should be passed in here.
+   * <p>
+   * When at least one country is mentioned, only the provinces of the mentioned
+   * countries are searched. Otherwise every known province is searched and a
+   * country hit is inferred from each province hit. Counties are searched for
+   * every province that was hit.
+   *
+   * @param text the full text of the document (block of text).
+   * @return the context for the text, or {@code null} if processing failed
+   */
+  private AdminBoundaryContext process(String text) {
+    try {
+      reset();
+      Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet);
+      // accumulated over every lookup; a single lookup result must not replace the earlier ones
+      Map<String, Set<Integer>> provinceOffsets = new HashMap<>();
+      Map<String, Set<Integer>> countyOffsets = new HashMap<>();
+
+      boolean inferCountries = countryhitMap.isEmpty();
+      List<Map<String, String>> provinceLookups = new ArrayList<>();
+      if (inferCountries) {
+        provinceLookups.addAll(provMap.values());
+      } else {
+        for (String cc : countryhitMap.keySet()) {
+          provinceLookups.add(provMap.get(cc));
+        }
+      }
+
+      for (Map<String, String> provsForCc : provinceLookups) {
+        if (provsForCc == null) {
+          continue;
+        }
+        Map<String, Set<Integer>> found = regexfind(text, provsForCc, provHits);
+        mergeMentions(provinceOffsets, found);
+        for (Map.Entry<String, Set<Integer>> provHit : found.entrySet()) {
+          String prov = provHit.getKey();
+          if (inferCountries) {
+            //fake a country hit based on a province hit... this gets fuzzy
+            String cc = prov.split("\\.")[0];
+            Set<Integer> countryOffsets = countryhitMap.get(cc);
+            if (countryOffsets == null) {
+              // own set: adding further provinces must not alter the province's offsets
+              countryOffsets = new HashSet<Integer>();
+              countryhitMap.put(cc, countryOffsets);
+              countryHitSet.add(cc);
+            }
+            countryOffsets.addAll(provHit.getValue());
+          }
+          Map<String, String> counties = countyMap.get(prov);
+          if (counties != null) {
+            mergeMentions(countyOffsets, regexfind(text, counties, countyHits));
+          }
+        }
+      }
+
+      // the fields get their own map instances: reset() clears the field maps on the
+      // next call, which must not empty the maps held by an already returned context
+      provMentions = new HashMap<String, Set<Integer>>(provinceOffsets);
+      countyMentions = new HashMap<String, Set<Integer>>(countyOffsets);
+
+      Map<String, String> countryRefMap = new HashMap<>();
+      for (String c : countryHitSet) {
+        String countryName = countryMap.get(c);
+        if (countryName != null) {
+          countryRefMap.put(c, countryName);
+        }
+      }
+
+      // hand out copies of the per-document sets for the same reason as above
+      return new AdminBoundaryContext(countryhitMap, provinceOffsets, countyOffsets,
+              new HashSet<String>(countryHitSet), new HashSet<String>(provHits), new HashSet<String>(countyHits),
+              countryRefMap, provMap, countyMap, new HashMap<String, Set<String>>(nameCodesMap));
+    } catch (Exception e) {
+      LOGGER.error("failed to build the admin boundary context", e);
+    }
+    return null;
+  }
+
+  /**
+   * Adds every position in {@code source} to the set stored under the same key
+   * in {@code target}, creating the set when the key is new.
+   */
+  private static void mergeMentions(Map<String, Set<Integer>> target, Map<String, Set<Integer>> source) {
+    for (Map.Entry<String, Set<Integer>> entry : source.entrySet()) {
+      Set<Integer> offsets = target.get(entry.getKey());
+      if (offsets == null) {
+        offsets = new HashSet<Integer>();
+        target.put(entry.getKey(), offsets);
+      }
+      offsets.addAll(entry.getValue());
+    }
+  }
+
+
   /**
    * Finds mentions of countries to assist in toponym resolution. Countries are
    * discovered via regex based on a configured file called
@@ -92,19 +213,19 @@ public class CountryContext {
    * list from the file.
    *
    * @param docText the full text of the document
-   * @param properties EntityLinkerProperties for getting database connection
    * @return
    */
+  @Deprecated
   public Map<String, Set<Integer>> regexfind(String docText) {
     countryMentions = new HashMap<>();
     nameCodesMap.clear();
     try {
-      
+
       for (CountryContextEntry entry : countrydata) {
         Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
         Matcher rs = regex.matcher(docText);
         String code = entry.getCc1().toLowerCase();
-        
+
         boolean found = false;
         while (rs.find()) {
           found = true;
@@ -130,45 +251,156 @@ public class CountryContext {
         if (found) {
           countryHits.add(entry);
         }
-        
+
       }
-      
+
     } catch (Exception ex) {
       LOGGER.error(ex);
     }
-    
+
     return countryMentions;
   }
-  
-  private List<CountryContextEntry> getCountryContextFromFile(File countryContextFile) {
-    List<CountryContextEntry> entries = new ArrayList<>();
-    String path = countryContextFile.getPath();
+
+  /**
+   * Discovers indicators of admin boundary data using regex. Each name is
+   * matched literally and case-insensitively, and only where it is not directly
+   * preceded or followed by a letter or digit. Every match is also recorded in
+   * the name-to-codes map under its lower-cased, punctuation-stripped text.
+   *
+   * @param docText the full text
+   * @param lookupMap a map to use to find names. the key=a location code, the
+   * value is an actual name.
+   * @param hitsRef a reference to a set that stores the hits by id
+   * @return the lower-cased location code mapped to the start positions of its
+   * matches in {@code docText}; empty when {@code docText} or
+   * {@code lookupMap} is {@code null}
+   */
+  private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef) {
+    Map<String, Set<Integer>> mentions = new HashMap<>();
+    if (lookupMap == null || docText == null) {
+      return mentions;
+    }
+
+    for (Map.Entry<String, String> entry : lookupMap.entrySet()) {
+      if (entry.getKey() == null || entry.getValue() == null) {
+        continue;
+      }
+      String name = entry.getValue().toLowerCase().replace(", the", "");
+      if (name.isEmpty()) {
+        // an empty name would match at almost every position
+        continue;
+      }
+      // The name is quoted so that regex metacharacters in a place name are matched
+      // literally. Lookarounds (rather than consumed delimiter characters) let names
+      // match at the very start/end of the text and directly after another match.
+      Pattern regex = Pattern.compile(
+              "(?<![\\p{L}\\p{Nd}])" + Pattern.quote(name) + "(?![\\p{L}\\p{Nd}])",
+              Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
+      Matcher rs = regex.matcher(docText);
+      String code = entry.getKey().toLowerCase();
+
+      boolean found = false;
+      while (rs.find()) {
+        found = true;
+        Set<Integer> offsets = mentions.get(code);
+        if (offsets == null) {
+          offsets = new HashSet<Integer>();
+          mentions.put(code, offsets);
+        }
+        offsets.add(rs.start());
+
+        String hit = rs.group().toLowerCase().trim();
+        hit = hit.replaceAll("\\.|,|;|\\?|!|\\\\|/|\"|'|=|-|&", "");
+        if (!hit.equals("")) {
+          Set<String> codes = nameCodesMap.get(hit);
+          if (codes == null) {
+            codes = new HashSet<String>();
+            nameCodesMap.put(hit, codes);
+          }
+          codes.add(code);
+        }
+      }
+      if (found) {
+        hitsRef.add(code);
+      }
+    }
+
+    return mentions;
+  }
+
+  private List<AdminBoundary> getContextFromFile(File countryContextFile) {
+    if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) {
+      return adminBoundaryData;
+    }
+    List<AdminBoundary> entries = new ArrayList<>();
     BufferedReader reader;
-    
     try {
-      path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
-      
-      reader = new BufferedReader(new FileReader(path));
-      
-      while (reader.read() != -1) {
-        String line = reader.readLine();
+      reader = new BufferedReader(new FileReader(countryContextFile));
+      String line = "";
+      while ((line = reader.readLine()) != null) {
         String[] values = line.split("\t");
-        if (values.length != 4) {
-          throw new IOException("improperly formatted country context file");
+        int len = values.length;
+        if (len < 5 || len > 6) {
+          throw new IllegalArgumentException("Improperly formatted file");
+        }
+        if (values.length == 6) {
+          AdminBoundary entry = new AdminBoundary(
+                  values[0].toLowerCase().trim(),
+                  values[3].toLowerCase().trim(),
+                  values[1].toLowerCase().trim(),
+                  values[4].toLowerCase().trim(),
+                  values[2].toLowerCase().trim(),
+                  values[5].toLowerCase().trim());
+          entries.add(entry);
+        } else {
+          AdminBoundary entry = new AdminBoundary(
+                  values[0].toLowerCase().trim(),
+                  values[3].toLowerCase().trim(),
+                  values[1].toLowerCase().trim(),
+                  values[4].toLowerCase().trim(),
+                  values[2].toLowerCase().trim(),
+                  "");
+          entries.add(entry);
         }
-        CountryContextEntry entry = new CountryContextEntry();
-        // rc,cc1, full_name_nd_ro,dsg
-        entry.setRc(values[0].toLowerCase());
-        entry.setCc1(values[1].toLowerCase());
-        entry.setFull_name_nd_ro(values[2].toLowerCase());
-        entry.setDsg(values[3].toLowerCase());
-        entries.add(entry);
+
       }
       reader.close();
     } catch (IOException ex) {
       LOGGER.error(ex);
     }
+    loadMaps(entries);
     return entries;
-    
+
   }
+
+  /**
+   * Fills the country, province and county reference maps from the loaded
+   * boundaries. Rows whose country code is the literal "null" are ignored; rows
+   * whose province code is "null" contribute the country only; county entries
+   * are added only when both county code and name carry real data.
+   *
+   * @param boundaries the boundaries read from the context file
+   */
+  private void loadMaps(List<AdminBoundary> boundaries) {
+    for (AdminBoundary boundary : boundaries) {
+      String country = boundary.getCountryCode();
+      if (country.equals("null")) {
+        continue;
+      }
+      countryMap.put(country, boundary.getCountryName());
+
+      String province = boundary.getProvCode();
+      if (province.equals("null")) {
+        continue;
+      }
+      // province ids are qualified by their country: <country>.<province>
+      String provinceId = country + "." + province;
+      Map<String, String> provinces = provMap.get(country);
+      if (provinces == null) {
+        provinces = new HashMap<>();
+        provMap.put(country, provinces);
+      }
+      provinces.put(provinceId, boundary.getProvinceName());
+
+      boolean countyCodeMissing = boundary.getCountyCode().toLowerCase().equals("no_data_found");
+      if (countyCodeMissing || boundary.getCountyName().toLowerCase().equals("no_data_found")) {
+        continue;
+      }
+      // county ids extend the province id: <country>.<province>.<county>
+      Map<String, String> counties = countyMap.get(provinceId);
+      if (counties == null) {
+        counties = new HashMap<>();
+        countyMap.put(provinceId, counties);
+      }
+      counties.put(provinceId + "." + boundary.getCountyCode(), boundary.getCountyName());
+    }
+  }
+
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java?rev=1609600&r1=1609599&r2=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java Fri Jul 11 01:04:58 2014
@@ -31,6 +31,9 @@ public class GazetteerEntry extends Base
   private String source;
   private String indexID;
   private Map<String, String> indexData = new HashMap<>();
+  private String countryCode;
+  private String provinceCode;
+  private String hierarchy;
 
   /**
    * returns the id from the lucene document
@@ -159,5 +162,28 @@ public class GazetteerEntry extends Base
     return true;
   }
 
+  /**
+   * @return the country code stored on this entry, or {@code null} if none
+   * has been set
+   */
+  public String getCountryCode() {
+    return countryCode;
+  }
+
+  /**
+   * @param countryCode the country code to store on this entry
+   */
+  public void setCountryCode(String countryCode) {
+    this.countryCode = countryCode;
+  }
+
+  /**
+   * @return the province code stored on this entry, or {@code null} if none
+   * has been set
+   */
+  public String getProvinceCode() {
+    return provinceCode;
+  }
+
+  /**
+   * @param provinceCode the province code to store on this entry
+   */
+  public void setProvinceCode(String provinceCode) {
+    this.provinceCode = provinceCode;
+  }
+
+  /**
+   * @return the hierarchy string stored on this entry, or {@code null} if none
+   * has been set. NOTE(review): the format of the value is not visible here;
+   * check where the gazetteer searcher populates it.
+   */
+  public String getHierarchy() {
+    return hierarchy;
+  }
+
+  /**
+   * @param hierarchy the hierarchy string to store on this entry
+   */
+  public void setHierarchy(String hierarchy) {
+    this.hierarchy = hierarchy;
+  }
 
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java?rev=1609600&r1=1609599&r2=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java Fri Jul 11 01:04:58 2014
@@ -28,7 +28,7 @@ public class GazetteerSearchCache {
   private static Map<String, ArrayList<GazetteerEntry>> gazCache = new HashMap<>();
 
 /**
- * returns the cached entries. Returns null if the query does not exists in the cache
+ * returns the cached entries. Returns null if the query does not exist in the cache
  * @param searchString
  * @return
  */

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1609600&r1=1609599&r2=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Fri Jul 11 01:04:58 2014
@@ -38,6 +38,7 @@ import org.apache.lucene.util.Version;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import org.apache.log4j.Logger;
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.search.Sort;
 
 /**
  *
@@ -63,11 +64,16 @@ public class GazetteerSearcher {
   private Analyzer usgsAnalyzer;
   private EntityLinkerProperties properties;
 
+  private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));
+  private IndexReader opennlpReader;// = DirectoryReader.open(geonamesIndex);
+  private IndexSearcher opennlpSearcher;// = new IndexSearcher(geonamesReader);
+  private Analyzer opennlpAnalyzer;
+
   public static void main(String[] args) {
     try {
       boolean b = Boolean.valueOf("true");
 
-      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("townsville, queensland", 5, "");
+      new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("baghdad", 5, "iz");
     } catch (IOException ex) {
       java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
     } catch (Exception ex) {
@@ -79,6 +85,112 @@ public class GazetteerSearcher {
     this.properties = properties;
     init();
   }
+/**
+ * Searches the single lucene index that includes the location hierarchy.
+ * @param searchString the location name to search for
+ * @param rowsReturned how many index entries to return (top N...)
+ * @param whereClause the conditional statement that defines the index type and the country code.
+ * @return 
+ */
+  public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) {
+    ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
+    searchString = cleanInput(searchString);
+    if (searchString.isEmpty()) {
+      return linkedData;
+    }
+    try {
+      /**
+       * build the search string. Sometimes no country context is found. In this
+       * case the code variables will be empty strings
+       */
+      String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ") AND " + whereClause;
+      if (searchString.trim().contains(" ")) {
+        placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))"
+                + " AND " + whereClause;
+      }
+
+      //  luceneQueryString = "hierarchy:(tampa florida) AND gazsource:usgs";
+      /**
+       * check the cache and go no further if the records already exist
+       */
+      ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(placeNameQueryString);
+      if (get != null) {
+
+        return get;
+      }
+      /**
+       * search the placename
+       */
+      QueryParser parser = new QueryParser(Version.LUCENE_48, placeNameQueryString, opennlpAnalyzer);
+      Query q = parser.parse(placeNameQueryString);
+      
+      TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned, Sort.RELEVANCE);
+  
+      for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {
+        GazetteerEntry entry = new GazetteerEntry();
+        int docId = bestDocs.scoreDocs[i].doc;
+        double sc = bestDocs.scoreDocs[i].score;
+
+        entry.getScoreMap().put("lucene", sc);
+        entry.setIndexID(docId + "");
+
+        Document d = opennlpSearcher.doc(docId);
+
+        List<IndexableField> fields = d.getFields();
+
+        String lat = d.get("latitude");
+        String lon = d.get("longitude");
+        String placename = d.get("placename");
+        String parentid = d.get("countrycode").toLowerCase();
+        String provid = d.get("admincode");
+        String itemtype = d.get("loctype");
+        String source = d.get("gazsource");
+        String hier = d.get("hierarchy");
+        entry.setSource(source);
+
+        entry.setItemID(docId + "");
+        entry.setLatitude(Double.valueOf(lat));
+        entry.setLongitude(Double.valueOf(lon));
+        entry.setItemType(itemtype);
+        entry.setItemParentID(parentid);
+        entry.setProvinceCode(provid);
+        entry.setCountryCode(parentid);
+        entry.setItemName(placename);
+        entry.setHierarchy(hier);
+        for (int idx = 0; idx < fields.size(); idx++) {
+          entry.getIndexData().put(fields.get(idx).name(), d.get(fields.get(idx).name()));
+        }
+        /**
+         * normalize the Levenshtein distance
+         */
+        int maxLen = searchString.length() > entry.getItemName().length() ? searchString.length() : entry.getItemName().length();
+
+        Double normLev = Math.abs(1-(sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length();
+        /**
+         * only want hits above the Levenshtein threshold. This should be a low
+         * threshold due to the use of the hierarchy field in the index
+         */
+        if (normLev.compareTo(scoreCutoff) >= 0) {
+//          if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {
+          entry.getScoreMap().put("normlucene", normLev);
+          //make sure we don't produce a duplicate
+          if (!linkedData.contains(entry)) {
+            linkedData.add(entry);
+            /**
+             * add the records to the cache for this query
+             */
+            GazetteerSearchCache.put(placeNameQueryString, linkedData);
+          }
+//          }
+        }
+      }
+
+    } catch (IOException | ParseException ex) {
+      LOGGER.error(ex);
+    }
+
+    return linkedData;
+  }
 
   /**
    *
@@ -88,6 +200,7 @@ public class GazetteerSearcher {
    *
    * @return
    */
+  @Deprecated
   public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
     searchString = cleanInput(searchString);
@@ -198,6 +311,7 @@ public class GazetteerSearcher {
    *
    * @return
    */
+    @Deprecated
   public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
     searchString = cleanInput(searchString);
@@ -284,7 +398,8 @@ public class GazetteerSearcher {
   }
 
   /**
-   * Replaces any noise chars with a space, and depending on configuration adds double quotes to the string
+   * Replaces any noise chars with a space, and depending on configuration adds
+   * double quotes to the string
    *
    * @param input
    * @return
@@ -300,36 +415,66 @@ public class GazetteerSearcher {
   }
 
   private void init() throws Exception {
-    if (usgsIndex == null) {
-      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+//    if (usgsIndex == null) {
+//      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+//      if (indexloc.equals("")) {
+//        // System.out.println("USGS Gaz location not found");
+//        LOGGER.error(new Exception("USGS Gaz location not found"));
+//      }
+//      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+//
+//      scoreCutoff = Double.valueOf(cutoff);
+//      String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
+//      doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
+//      usgsIndex = new MMapDirectory(new File(indexloc));
+//      usgsReader = DirectoryReader.open(usgsIndex);
+//      usgsSearcher = new IndexSearcher(usgsReader);
+//      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+//    }
+//    if (geonamesIndex == null) {
+//      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
+//      if (indexloc.equals("")) {
+//        LOGGER.error(new Exception("Geonames Gaz location not found"));
+//
+//      }
+//      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+//      scoreCutoff = Double.valueOf(cutoff);
+//      geonamesIndex = new MMapDirectory(new File(indexloc));
+//      geonamesReader = DirectoryReader.open(geonamesIndex);
+//      geonamesSearcher = new IndexSearcher(geonamesReader);
+//      //TODO: a language code switch statement should be employed here at some point
+//      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+//
+//    }
+    if (opennlpIndex == null) {
+      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");
       if (indexloc.equals("")) {
-        // System.out.println("USGS Gaz location not found");
-        LOGGER.error(new Exception("USGS Gaz location not found"));
+        LOGGER.error(new Exception("Opennlp combined Gaz directory location not found"));
+
       }
-      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+      //  String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+      //  scoreCutoff = Double.valueOf(cutoff);
+      opennlpIndex = new MMapDirectory(new File(indexloc));
+      opennlpReader = DirectoryReader.open(opennlpIndex);
+      opennlpSearcher = new IndexSearcher(opennlpReader);
+      //TODO: a language code switch statement should be employed here at some point
+      opennlpAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
 
-      scoreCutoff = Double.valueOf(cutoff);
-      String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
-      doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
-      usgsIndex = new MMapDirectory(new File(indexloc));
-      usgsReader = DirectoryReader.open(usgsIndex);
-      usgsSearcher = new IndexSearcher(usgsReader);
-      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
     }
-    if (geonamesIndex == null) {
-      String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
-      if (indexloc.equals("")) {
-        LOGGER.error(new Exception("Geonames Gaz location not found"));
+  }
 
+  private String formatForHierarchy(String searchTerm) {
+    String[] parts = searchTerm.split(" ");
+    String out = "";
+    if (parts.length != 0) {
+      for (String string : parts) {
+        out += string + " AND ";
       }
-      String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
-      scoreCutoff = Double.valueOf(cutoff);
-      geonamesIndex = new MMapDirectory(new File(indexloc));
-      geonamesReader = DirectoryReader.open(geonamesIndex);
-      geonamesSearcher = new IndexSearcher(geonamesReader);
-      //TODO: a language code switch statement should be employed here at some point
-      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
-
+      out = out.substring(0, out.lastIndexOf(" AND "));
+    } else {
+      out = cleanInput(searchTerm);
     }
+    return out;
   }
+
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1609600&r1=1609599&r2=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Fri Jul 11 01:04:58 2014
@@ -15,6 +15,11 @@
  */
 package opennlp.addons.geoentitylinker;
 
+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
+import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;
+import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;
+import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;
+import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -33,11 +38,11 @@ import opennlp.tools.entitylinker.Entity
  */
 public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
 
-  private CountryContext countryContext;
+  private AdminBoundaryContextGenerator countryContext;
   private Map<String, Set<Integer>> countryMentions;
   private EntityLinkerProperties linkerProperties;
   private GazetteerSearcher gazateerSearcher;
-  private List<LinkedEntityScorer> scorers = new ArrayList<>();
+  private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>();
 
   @Override
   public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
@@ -46,8 +51,8 @@ public class GeoEntityLinker implements 
     if (linkerProperties == null) {
       throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
     }
-    countryMentions = countryContext.regexfind(doctext);
-
+    //countryMentions = countryContext.regexfind(doctext);
+    AdminBoundaryContext context = countryContext.getContext(doctext);
     for (int s = 0; s < sentences.length; s++) {
       Span[] names = namesBySentence[s];
       String[] tokens = tokensBySentence[s];
@@ -55,51 +60,27 @@ public class GeoEntityLinker implements 
 
       for (int i = 0; i < matches.length; i++) {
 
-        /**
-         * nga gazateer is for other than US placenames,don't want to use it if
-         * US is the only country mentioned in the doc
-         *
-         */
         ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
-        if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)
-                || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
-
-          if (!countryMentions.keySet().isEmpty()) {
-            for (String code : countryMentions.keySet()) {
-              if (!code.equals("us")) {
-                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code));
-              }
-            }
-          } else {
-            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, ""));
-
+        if (!context.getWhereClauses().isEmpty()) {
+          for (String whereclause : context.getWhereClauses()) {
+            geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, whereclause));
           }
-
-        }
-        ArrayList<BaseLink> usgsEntries = new ArrayList<>();
-        if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
-          //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
-          usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3));
-        }
-        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<>(geoNamesEntries, names[i].getStart(), names[i].getEnd(), "location",names[i].getProb());
-    
-
-        if (!usgsEntries.isEmpty()) {
-          geoSpan.getLinkedEntries().addAll(usgsEntries);
-          geoSpan.setSearchTerm(matches[i]);
-        }
-
-        if (!geoSpan.getLinkedEntries().isEmpty()) {
-          geoSpan.setSearchTerm(matches[i]);
-          geoSpan.setSentenceid(s);
-          spans.add(geoSpan);
+        }else{//this means there were no where clauses generated so the where clause will default to look at the entire index
+          geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:* "));
         }
+        //start generating queries
+        LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);
+        newspan.setSearchTerm(matches[i]);
+        newspan.setLinkedEntries(geoNamesEntries);
+        newspan.setSentenceid(s);
+        spans.add(newspan);
       }
+
     }
 
     if (!scorers.isEmpty()) {
       for (LinkedEntityScorer scorer : scorers) {
-        scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
+        scorer.score(spans, doctext, sentences, linkerProperties, context);
       }
     }
 
@@ -111,6 +92,8 @@ public class GeoEntityLinker implements 
       scorers.add(new GeoHashBinningScorer());
       scorers.add(new CountryProximityScorer());
       scorers.add(new ModelBasedScorer());
+      scorers.add(new FuzzyStringMatchScorer());
+     // scorers.add(new ProvinceProximityScorer());
     }
   }
 
@@ -118,7 +101,7 @@ public class GeoEntityLinker implements 
   public void init(EntityLinkerProperties properties) {
     try {
       this.linkerProperties = properties;
-      countryContext = new CountryContext(this.linkerProperties);
+      countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
       gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
       loadScorers();
     } catch (Exception ex) {

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java (from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java&r1=1594067&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java Fri Jul 11 01:04:58 2014
@@ -13,20 +13,15 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.indexing;
 
-import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileReader;
+import java.io.FileNotFoundException;
 import java.util.ArrayList;
-import java.util.List;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.store.Directory;
@@ -39,11 +34,26 @@ import org.apache.lucene.util.Version;
  */
 public class GazetteerIndexer {
 
+  public static void main(String[] args) {
+    try {
+      GazetteerIndexer i = new GazetteerIndexer();
+      i.index(new File("C:\\temp\\gazetteers\\geonamesdata\\allcountries\\allCountries.txt"),
+              new File("C:\\temp\\gazetteers\\geonamesdata\\countryinfo.txt"),
+              new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"),
+              new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"),
+              new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),
+              new File("C:\\temp\\gazetteers\\"),
+              new File("C:\\temp\\gazetteers\\newCountryContextFile.txt"),
+              new File("C:\\temp\\gazetteers\\regions.txt"));
+    } catch (Exception ex) {
+      ex.printStackTrace();
+    }
+  }
+
   public GazetteerIndexer() {
 
   }
 
-
   public static interface Separable {
 
     String getSeparator();
@@ -76,64 +86,142 @@ public class GazetteerIndexer {
   }
 
   /**
-   * indexes the USGS or Geonames gazateers.
    *
-   * @param outputIndexDir     a DIRECTORY path where you would like to store
-   *                           the output lucene indexes
-   * @param gazetteerInputData the file, "as is" that was downloaded from the
-   *                           USGS and GEONAMES website
-   * @param type               indicates whether the data is USGS or GEONAMES
-   *                           format
+   * @param geonamesData the actual Geonames gazetteer data downloaded from
+   * here: http://download.geonames.org/export/dump/ then click on this
+   * link 'allCountries.zip'
+   * @param geoNamesCountryInfo the countryinfo lookup table that can be
+   * downloaded from here
+   * http://download.geonames.org/export/dump/countryinfo.txt
+   * @param geonamesAdmin1CodesASCII The lookup data for the province names for
+   * each place found here:
+   * http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight the
+   * table view, and copy results into a text file. Make sure the tab delimited
+   * format is maintained.
+   * @param usgsDataFile the actual USGS gazetteer downloaded from here:
+   * http://geonames.usgs.gov/domestic/download_data.htm click on the
+   * national_file####.zip link to get all the most recent features
+   *
+   * @param usgsGovUnitsFile go to here:
+   * http://geonames.usgs.gov/domestic/download_data.htm in the section titled
+   * "Topical Gazetteers -- File Format" click on the drop down list and select
+   * "Government Units". The downloaded file is what you need for this param.
+   * @param outputIndexDir where you want the final index. Must be a directory,
+   * not an actual file.
+   * @param outputCountryContextFile The output countrycontext file. This is a
+   * very important file used inside the GeoEntityLinker to assist in toponym
+   * resolution.
+   * @param regionsFile this file contains a list of regions in the following
+   * format: tab delimited text with index 0 as the name of the region, index 1
+   * as the longitude, and index 2 as the latitude
    * @throws Exception
    */
-  public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {
+  public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII,
+          File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile) throws Exception {
     if (!outputIndexDir.isDirectory()) {
       throw new IllegalArgumentException("outputIndexDir must be a directory.");
     }
+    if (!geonamesData.exists()) {
+      throw new FileNotFoundException("geonames data file does not exist");
+    }
+    if (!geoNamesCountryInfo.exists()) {
+      throw new FileNotFoundException("geoNamesCountryCodes data file does not exist");
+    }
+    if (!geonamesAdmin1CodesASCII.exists()) {
+      throw new FileNotFoundException("geonamesAdmin1CodesASCII data file does not exist");
+    }
 
-    String indexloc = outputIndexDir + type.toString();
+    if (!usgsDataFile.exists()) {
+      throw new FileNotFoundException("usgsDataFile data file does not exist");
+    }
+    if (!usgsGovUnitsFile.exists()) {
+      throw new FileNotFoundException("usgsGovUnitsFile data file does not exist");
+    }
+    if (!outputIndexDir.exists()) {
+      throw new FileNotFoundException("outputIndexDir data file does not exist");
+    }
+    if (!regionsFile.exists()) {
+      throw new FileNotFoundException("regionsFile data file does not exist");
+    }
+
+    String indexloc = outputIndexDir.getPath() + "/opennlp_geoentitylinker_gazetteer";
     Directory index = new MMapDirectory(new File(indexloc));
 
     Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
     IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);
 
     IndexWriter w = new IndexWriter(index, config);
+    USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);
 
-    readFile(gazetteerInputData, w, type);
+    GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData, outputCountryContextFile, w);
+
+    RegionProcessor.process(regionsFile, outputCountryContextFile, w);
     w.commit();
     w.close();
-
+    System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' and context file '" + outputCountryContextFile.getPath() + "' to entitylinker.properties file");
   }
 
-  public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
-    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
-    List<String> fields = new ArrayList<>();
-    int counter = 0;
-    System.out.println("reading gazetteer data from file...........");
-    while (reader.read() != -1) {
-      String line = reader.readLine();
-      String[] values = line.split(type.getSeparator());
-      if (counter == 0) {
-        for (String columnName : values) {
-          fields.add(columnName.replace("»¿", "").trim());
-        }
-
-      } else {
-        Document doc = new Document();
-        for (int i = 0; i < fields.size() - 1; i++) {         
-          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
-        }     
-        w.addDocument(doc);
-      }
-      counter++;
-      if (counter % 100000 == 0) {
-        w.commit();
-        System.out.println(counter + " .........committed to index..............");
-      }
+  /**
+   * indexes the USGS or Geonames gazetteers.
+   *
+   * @param outputIndexDir a DIRECTORY path where you would like to store the
+   * output lucene indexes
+   * @param gazetteerInputData the file, "as is" that was downloaded from the
+   * USGS and GEONAMES website
+   * @param type indicates whether the data is USGS or GEONAMES format
+   * @throws Exception
+   */
+  @Deprecated
+  public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {
+    if (!outputIndexDir.isDirectory()) {
+      throw new IllegalArgumentException("outputIndexDir must be a directory.");
 
     }
+
+    String indexloc = outputIndexDir + type.toString();
+    Directory index = new MMapDirectory(new File(indexloc));
+
+    Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);
+
+    IndexWriter w = new IndexWriter(index, config);
+    //  GeonamesProcessor.process(new File("C:\\temp\\gazetteers\\geonamesdata\\countrycodes.txt"), new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"), gazetteerInputData, null, w);
+    // USGSProcessor.process(gazetteerInputData, outputIndexDir, w);
+    //  readFile(gazetteerInputData, w, type);
     w.commit();
-    System.out.println("Completed indexing gaz! index name is: " + type.toString());
+    w.close();
+
   }
+//
+//  public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
+//    BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+//    List<String> fields = new ArrayList<>();
+//    int counter = 0;
+//    System.out.println("reading gazetteer data from file...........");
+//    while (reader.read() != -1) {
+//      String line = reader.readLine();
+//      String[] values = line.split(type.getSeparator());
+//      if (counter == 0) {
+//        for (String columnName : values) {
+//          fields.add(columnName.replace("»¿", "").trim());
+//        }
+//
+//      } else {
+//        Document doc = new Document();
+//        for (int i = 0; i < fields.size() - 1; i++) {
+//          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+//        }
+//        w.addDocument(doc);
+//      }
+//      counter++;
+//      if (counter % 100000 == 0) {
+//        w.commit();
+//        System.out.println(counter + " .........committed to index..............");
+//      }
+//
+//    }
+//    w.commit();
+//    System.out.println("Completed indexing gaz! index name is: " + type.toString());
+//  }
 
 }

Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java (from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java&r1=1585862&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java Fri Jul 11 01:04:58 2014
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.indexing;
 
 import java.io.BufferedOutputStream;
 import java.io.File;
@@ -28,6 +28,8 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundaryContextGenerator;
+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
 
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
@@ -41,6 +43,7 @@ import opennlp.tools.util.PlainTextByLin
  *
  * Tools for setting up GeoEntityLinker gazateers and doccat scoring model
  */
+@Deprecated
 public class GeoEntityLinkerSetupUtils {
   private static final int RADIUS = 200;
   public static ModelBasedScorer scorer;
@@ -86,7 +89,7 @@ public class GeoEntityLinkerSetupUtils {
    * @throws IOException
    */
   public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {
-    CountryContext context = new CountryContext(properties);
+    AdminBoundaryContextGenerator context = new AdminBoundaryContextGenerator(properties);
     FileWriter writer = new FileWriter(annotationOutFile, true);
     System.out.println("processing " + documents.size() + " documents");
     for (String docText : documents) {
@@ -131,7 +134,7 @@ public class GeoEntityLinkerSetupUtils {
    * @param radius
    * @return
    */
-  private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+  private static Map<String, ArrayList<String>> modelCountryContext(String docText, AdminBoundaryContextGenerator additionalContext, int radius) {
     Map<String, ArrayList< String>> featureBags = new HashMap<>();
     Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
     /**

Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,178 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.*;
+import java.net.*;
+import java.util.Enumeration;
+import java.util.zip.*;
+
+/**
+ * Downloads Geonames gazetteer files and unpacks the zip archives among them.
+ *
+ * @author mgiaconia
+ */
+public class GeonamesFileDownloader {
+
+  /** Size in bytes of the buffer used while copying a download to disk. */
+  final static int size = 1024;
+  // NOTE(review): this is the ZM.zip archive, not a complete dump - confirm it is the intended source.
+  private static final String ALL_COUNTRIES = "http://download.geonames.org/export/dump/ZM.zip";
+  // NOTE(review): empty address - fileDownload() rejects it, so nothing is downloaded for it.
+  private static final String COUNTRY_INFO = "";
+  // NOTE(review): empty address - fileDownload() rejects it, so nothing is downloaded for it.
+  private static final String ADM1_LOOKUP = "";
+
+  public static void main(String[] args) {
+    downloadGeonamesFiles(COUNTRY_INFO, "c:\\temp\\gazetteers");
+  }
+
+  /**
+   * Downloads the {@code ALL_COUNTRIES} archive into {@code outputDir} and unzips it
+   * there, then attempts to download the {@code COUNTRY_INFO} and {@code ADM1_LOOKUP} files.
+   *
+   * @param outputFileName not used by this method
+   * @param outputDir directory that receives the downloaded and extracted files
+   */
+  public static void downloadGeonamesFiles(String outputFileName, String outputDir) {
+    String fileDownload = fileDownload(ALL_COUNTRIES, outputDir);
+
+    unzipMyZip(fileDownload, outputDir);
+
+    fileDownload(COUNTRY_INFO, outputDir);
+    fileDownload(ADM1_LOOKUP, outputDir);
+  }
+
+  /**
+   * Copies every byte of {@code in} to {@code out}, then closes both streams. The
+   * streams are closed even when the copy fails.
+   *
+   * @param in the stream to read from
+   * @param out the stream to write to
+   * @throws IOException if reading, writing or closing a stream fails
+   */
+  public static final void writeFile(InputStream in, OutputStream out)
+          throws IOException {
+    try (InputStream source = in; OutputStream target = out) {
+      byte[] buffer = new byte[1024];
+      int len;
+      // read() reports end of stream with -1; a return of 0 is not an end marker
+      while ((len = source.read(buffer)) != -1) {
+        target.write(buffer, 0, len);
+      }
+    }
+  }
+
+  /**
+   * Extracts every entry of a zip archive into a directory, naming each output file
+   * after its entry. An I/O failure is printed to standard error and stops the extraction.
+   *
+   * @param zipFileName path of the zip archive to read
+   * @param directoryToExtractTo directory that receives the extracted entries
+   */
+  public static void unzipMyZip(String zipFileName,
+          String directoryToExtractTo) {
+    try (ZipFile zip = new ZipFile(zipFileName)) {
+      File targetDir = new File(directoryToExtractTo).getCanonicalFile();
+      String targetPrefix = targetDir.getPath();
+      if (!targetPrefix.endsWith(File.separator)) {
+        targetPrefix += File.separator;
+      }
+      Enumeration<? extends ZipEntry> entriesEnum = zip.entries();
+      while (entriesEnum.hasMoreElements()) {
+        ZipEntry entry = entriesEnum.nextElement();
+        File outFile = new File(targetDir, entry.getName()).getCanonicalFile();
+        // refuse entry names such as "../x" that resolve outside the target directory
+        if (!outFile.getPath().startsWith(targetPrefix)) {
+          throw new IOException("zip entry resolves outside of " + targetDir + ": " + entry.getName());
+        }
+        if (entry.isDirectory()) {
+          outFile.mkdirs();
+          continue;
+        }
+        outFile.getParentFile().mkdirs();
+        try (InputStream is = zip.getInputStream(entry);
+                OutputStream os = new FileOutputStream(outFile)) {
+          byte[] buf = new byte[4096];
+          int r;
+          while ((r = is.read(buf)) != -1) {
+            os.write(buf, 0, r);
+          }
+        }
+      }
+    } catch (IOException ioe) {
+      System.err.println("Some Exception Occurred:");
+      ioe.printStackTrace();
+    }
+  }
+
+  /**
+   * Downloads the resource at {@code fAddress} into the file {@code localFileName} inside
+   * {@code destinationDir}. Failures are printed rather than thrown.
+   *
+   * @param fAddress the URL to download
+   * @param localFileName name of the file to create
+   * @param destinationDir directory in which the file is created
+   * @return the path of the local file; it is returned even when the download failed
+   */
+  public static String fileUrl(String fAddress, String localFileName, String destinationDir) {
+    // File joins the two parts with the platform separator instead of a hard-coded "\\"
+    File localFile = new File(destinationDir, localFileName);
+    try {
+      URLConnection uCon = new URL(fAddress).openConnection();
+      long byteWritten = 0;
+      // both streams are closed here even if opening or copying fails part-way
+      try (InputStream is = uCon.getInputStream();
+              OutputStream outStream = new BufferedOutputStream(new FileOutputStream(localFile))) {
+        byte[] buf = new byte[size];
+        int byteRead;
+        while ((byteRead = is.read(buf)) != -1) {
+          outStream.write(buf, 0, byteRead);
+          byteWritten += byteRead;
+        }
+      }
+      System.out.println("Downloaded Successfully.");
+      System.out.println("File name:\"" + localFileName + "\"\nNo ofbytes :" + byteWritten);
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+    return localFile.getPath();
+  }
+
+  /**
+   * Downloads {@code fAddress} into {@code destinationDir}, naming the local file after
+   * the text that follows the last {@code '/'} of the address.
+   *
+   * @param fAddress the URL to download; it must contain a {@code '.'} and a non-empty file name
+   * @param destinationDir directory in which the file is created
+   * @return the path of the local file, or an empty string when {@code fAddress} is rejected
+   */
+  public static String fileDownload(String fAddress, String destinationDir) {
+    int slashIndex = fAddress.lastIndexOf('/');
+    int periodIndex = fAddress.lastIndexOf('.');
+
+    String retFileName = "";
+    if (periodIndex >= 1 && slashIndex >= 0
+            && slashIndex < fAddress.length() - 1) {
+      String fileName = fAddress.substring(slashIndex + 1);
+      retFileName = fileUrl(fAddress, fileName, destinationDir);
+    } else {
+      System.err.println("path or file name.");
+    }
+    return retFileName;
+  }
+
+}

Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,303 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundary;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ * Builds the Geonames part of the gazetteer index and of the country context file from
+ * the Geonames country, province (ADM1) and gazetteer files.
+ *
+ * @author mgiaconia
+ */
+public class GeonamesProcessor {
+
+  /**
+   * Indexes the Geonames gazetteer and appends its province data to the country context file.
+   *
+   * @param countryCodesLookupFile tab separated country lookup file
+   * @param adm1CodesLookupFile tab separated province lookup file
+   * @param geonamesGazetteerFile the Geonames gazetteer file to index
+   * @param outputCountryContextFile the country context file to append to
+   * @param w the lucene index writer
+   * @throws Exception if reading the gazetteer or writing to the index fails
+   */
+  public static void process(File countryCodesLookupFile, File adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, IndexWriter w) throws Exception {
+    Map<String, String> countryCodes = getCountryCodes(countryCodesLookupFile);
+    Map<String, AdminBoundary> adm1s = getProvData(adm1CodesLookupFile, countryCodes);
+
+    readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, adm1s, countryCodes, w);
+    //now append to the country context file
+    writeCountryContextFile(outputCountryContextFile, adm1s);
+  }
+
+  public GeonamesProcessor() {
+  }
+
+  /**
+   * Reads the province lookup file into a map keyed by lower case
+   * {@code countrycode.provincecode}. A read failure, or a line without exactly four
+   * columns, is printed and ends the read; the entries read so far are returned.
+   *
+   * @param adm1CodesLookupFile tab separated file with the code in column 0 and the province name in column 2
+   * @param ccodes country names keyed by lower case country code
+   * @return the province data; an entry is built with a null country name when {@code ccodes} has no match
+   */
+  private static Map<String, AdminBoundary> getProvData(File adm1CodesLookupFile, Map<String, String> ccodes) {
+    System.out.println("Attempting to read geonames province data from: " + adm1CodesLookupFile.getPath());
+
+    Map<String, AdminBoundary> outmap = new HashMap<>();
+    Set<String> nullcodes = new HashSet<>();
+    // try-with-resources closes the reader on every path, including the malformed-line exit
+    try (BufferedReader reader = new BufferedReader(new FileReader(adm1CodesLookupFile))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String[] values = line.split("\t");
+        if (values.length != 4) {
+          throw new IOException("improperly formatted province lookup file");
+        }
+        String ccode = values[0].toLowerCase();
+
+        String[] split = ccode.split("\\.");
+        String pcode = "";
+        if (split.length == 2) {
+          ccode = split[0];
+          pcode = split[1];
+        }
+
+        String pname = values[2];
+
+        //a first part that starts with a digit is taken to be the province code, so swap the two
+        if (ccode.matches("[0-9].*")) {
+          String code = ccode;
+          ccode = pcode;
+          pcode = code;
+        }
+
+        String cname = ccodes.get(ccode);
+        if (cname == null) {
+          nullcodes.add(ccode);
+        }
+        outmap.put(ccode + "." + pcode, new AdminBoundary(ccode, pcode, pname, cname));
+      }
+      System.out.println("INFO: there were " + nullcodes.size() + " null prov codes. This is due to inconsistencies in reference data.");
+      //reported only when the whole file was read
+      System.out.println("Successfully read geonames province data from: " + adm1CodesLookupFile.getPath());
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+
+    return outmap;
+  }
+
+  /**
+   * Reads the country lookup file into a map from lower case country code to lower case
+   * country name. Lines up to and including the {@code #ISO} column header are skipped,
+   * as are lines with fewer than five columns. A read failure is printed and the
+   * entries read so far are returned.
+   *
+   * @param countryContextFile tab separated file with the country code in column 0 and the country name in column 4
+   * @return country names keyed by country code
+   */
+  private static Map<String, String> getCountryCodes(File countryContextFile) {
+    Map<String, String> ccs = new HashMap<>();
+    try (BufferedReader reader = new BufferedReader(new FileReader(countryContextFile))) {
+      String line;
+      boolean start = false;
+      while ((line = reader.readLine()) != null) {
+        if (!start) {
+          //the header names the columns; it is not a country, so it is skipped as well
+          start = line.toLowerCase().startsWith("#iso\t");
+          continue;
+        }
+        String[] values = line.split("\t");
+        if (values.length < 5) {
+          continue;
+        }
+
+        String ccode = values[0].toLowerCase();//this is the 2 digit ISO code
+        String cname = values[4].toLowerCase();
+        if (!ccode.equals("")) {
+          ccs.put(ccode, cname);
+        }
+      }
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+    //every lookup in this class uses a lower case code, so the fallbacks must be lower case
+    //too; they never replace a name that came from the file
+    if (!ccs.containsKey("ss")) {
+      ccs.put("ss", "south sudan");
+    }
+    if (!ccs.containsKey("cs")) {
+      ccs.put("cs", "kosovo");
+    }
+    return ccs;
+  }
+
+  /**
+   * Appends one tab separated line per province to the country context file. The line
+   * holds country code, province code, an empty column, country name, province name and
+   * another empty column. A write failure is printed rather than thrown.
+   *
+   * @param outfile the country context file to append to
+   * @param adms the province data to write; null values are skipped
+   */
+  public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
+    try (FileWriter writer = new FileWriter(outfile, true)) {
+      for (AdminBoundary adm : adms.values()) {
+        if (adm == null) {
+          continue;
+        }
+        String province = adm.getProvinceName();
+        String country = adm.getCountryName();
+
+        String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "" + "\t" + country + "\t" + province + "\t" + "" + "\n";
+        writer.write(line);
+      }
+      //the writer is closed by try-with-resources; success is reported only when no write failed
+      System.out.println("successfully wrote Geonames entries to country oontext file");
+    } catch (IOException ex) {
+      ex.printStackTrace();
+    }
+  }
+
+  /**
+   * Adds one lucene document per gazetteer line to the index, committing after every
+   * 100000 documents. Lines with fewer than 11 columns are reported and skipped.
+   *
+   * @param gazateerInputData the Geonames allCountries.txt file
+   * @param type the types of gaz entry, usgs, geonames, or regions
+   * @param adms the province info
+   * @param countrycodes the country code info
+   * @param w the lucene index writer
+   * @throws Exception if reading the file or writing to the index fails
+   */
+  public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception {
+
+    String[] fieldStrings = new String[]{
+      "geonameid",
+      "name",
+      "asciiname",
+      "alternatenames",
+      "latitude",
+      "longitude",
+      "feature_class",
+      "feature_code",
+      "country code",
+      "cc2",
+      "admin1_code",
+      "admin2_code",
+      "admin3_code",
+      "admin4_code",
+      "population",
+      "elevation",
+      "dem ",
+      "timezone",
+      "modification_date"};
+
+    List<String> fields = Arrays.asList(fieldStrings);
+    int counter = 0;
+    System.out.println("reading gazetteer data from file...........");
+    //the reader is closed when the loop ends or an exception leaves it
+    try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String[] values = line.split(type.getSeparator());
+        //columns 0 to 10 are read by index below
+        if (values.length < 11) {
+          System.err.println("skipping gazetteer line with fewer than 11 columns: " + line);
+          continue;
+        }
+
+        Document doc = new Document();
+        String admincode = values[10].toLowerCase();
+        String ccode = values[8].toLowerCase();
+        if (ccode.contains(",")) {
+          String[] codes = ccode.split(",");
+          if (codes.length > 0) {
+            ccode = codes[0];
+          }
+        }
+        AdminBoundary adm = adms.get(ccode + "." + admincode);
+
+        String placeName = values[2];
+        String lat = values[4];
+        String lon = values[5];
+        String dsg = values[7];
+        String id = values[0];
+        ///don't want a single token hierarchy entry, so it stays empty without a country name
+        String concatIndexEntry = "";
+        if (adm != null) {
+          concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() + ", " + placeName;
+        } else {
+          //there is no admin info, but we can still use the countrycode to concat the country name
+          String n = countrycodes.get(ccode);
+          if (n != null) {
+            concatIndexEntry = n + ", " + placeName;
+          }
+        }
+        //NOTE(review): the bound leaves out the last field name (modification_date) - confirm that is intended.
+        //it is also capped at the number of columns this line actually has
+        int indexedColumns = Math.min(fields.size() - 1, values.length);
+        for (int i = 0; i < indexedColumns; i++) {
+          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+        }
+
+        /**
+         * add standard fields to the index
+         */
+        doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
+        doc.add(new TextField("placename", placeName, Field.Store.YES));
+        doc.add(new TextField("latitude", lat, Field.Store.YES));
+        doc.add(new TextField("longitude", lon, Field.Store.YES));
+        doc.add(new TextField("loctype", dsg, Field.Store.YES));
+        doc.add(new TextField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES));
+        doc.add(new TextField("countrycode", ccode.toLowerCase(), Field.Store.YES));
+        doc.add(new TextField("countycode", "", Field.Store.YES));
+
+        doc.add(new TextField("locid", id, Field.Store.YES));
+        doc.add(new TextField("gazsource", "geonames", Field.Store.YES));
+        w.addDocument(doc);
+
+        counter++;
+        if (counter % 100000 == 0) {
+          w.commit();
+          System.out.println(counter + " .........Geonames entries committed to index..............");
+        }
+      }
+    }
+
+    System.out.println("Completed indexing gaz! index name is: " + type.toString());
+  }
+
+}

Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.geoentitylinker.AdminBoundary;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ * Adds the entries of a regions file to the gazetteer index and to the country context file.
+ *
+ * @author mgiaconia
+ */
+public class RegionProcessor {
+
+  public static void main(String[] args) {
+    RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null);
+  }
+
+  /**
+   * Indexes the regions file and appends its entries to the country context file.
+   * A failure is printed rather than thrown.
+   *
+   * @param regionsFile the file that stores Region references. the format of
+   * this file is tab delimited text with index 0 as the name of the region,
+   * index 1 as the longitude, and index 2 as the latitude
+   * @param outputCountryContextfile this is the country context files shared by
+   * all indexing processors
+   * @param w the lucene index writer; when it is null nothing is indexed and only the
+   * country context file is written
+   */
+  public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
+    try {
+      readFile(regionsFile, outputCountryContextfile, w);
+    } catch (Exception ex) {
+      ex.printStackTrace();
+    }
+  }
+
+  /**
+   * Reads the regions file, adds one lucene document per region to the index, commits,
+   * and then appends one line per region to the country context file. The first line of
+   * the file is not indexed, and lines with fewer than three columns are reported and skipped.
+   *
+   * @param gazateerInputData tab separated regions file: name, longitude, latitude
+   * @param outputCountryContextfile the country context file to append to
+   * @param w the lucene index writer; may be null, in which case nothing is indexed
+   * @throws Exception if reading, indexing or writing fails
+   */
+  public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception {
+    List<String> ccfileentries = new ArrayList<>();
+    int counter = 0;
+    System.out.println("reading gazetteer data from Regions file...........");
+    //the reader is closed when the loop ends or an exception leaves it
+    try (BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        //every line consumes a number, kept or not, so a region id follows the line position
+        int lineIndex = counter++;
+        if (lineIndex == 0) {
+          continue;
+        }
+        String[] values = line.split("\t");
+        if (values.length < 3) {
+          System.err.println("skipping region line with fewer than 3 columns: " + line);
+          continue;
+        }
+        String placeName = values[0];
+        String lat = values[2];
+        String lon = values[1];
+        String dsg = "region";
+        String id = "rg" + lineIndex;
+
+        String hierarchy = placeName;
+
+        Document doc = new Document();
+        doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
+        doc.add(new TextField("placename", placeName, Field.Store.YES));
+        doc.add(new TextField("latitude", lat, Field.Store.YES));
+        doc.add(new TextField("longitude", lon, Field.Store.YES));
+        doc.add(new TextField("loctype", dsg, Field.Store.YES));
+        doc.add(new TextField("admincode", "", Field.Store.YES));
+        doc.add(new TextField("countrycode", id, Field.Store.YES));
+        doc.add(new TextField("countycode", "", Field.Store.YES));
+
+        doc.add(new TextField("locid", id, Field.Store.YES));
+        doc.add(new TextField("gazsource", "region", Field.Store.YES));
+        //countrycontext file format
+        // US	KY	131	United States	Kentucky	Leslie
+
+        ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND_VALUE" + "\t" + "NO_DATA_FOUND_VALUE\n");
+        if (w != null) {
+          w.addDocument(doc);
+        }
+      }
+    }
+    if (w != null) {
+      w.commit();
+    }
+    //the writer is closed even when a write fails
+    try (FileWriter writer = new FileWriter(outputCountryContextfile, true)) {
+      for (String string : ccfileentries) {
+        writer.write(string);
+      }
+      System.out.println("successfully wrote Region entries to country oontext file");
+    }
+    System.out.println("Completed indexing regions!");
+  }
+
+}