You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/07/11 03:04:59 UTC
svn commit: r1609600 [1/2] - in
/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker:
./ indexing/ scoring/
Author: markg
Date: Fri Jul 11 01:04:58 2014
New Revision: 1609600
URL: http://svn.apache.org/r1609600
Log:
OPENNLP-706
OPENNLP-707
OPENNLP-708
OPENNLP-709
OPENNLP-710
Addressed each ticket. Also adjusted the package structure a bit to separate responsibility better.
Added:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java
- copied, changed from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java
- copied, changed from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java
- copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
- copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
- copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
- copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
- copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
- copied, changed from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/PointClustering.java
- copied, changed from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ProvinceProximityScorer.java
- copied, changed from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
Removed:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/PointClustering.java
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundary.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.Objects;
+
+/**
+ * Stores an admin boundary down to the US county level. Only US places from the
+ * USGS Gazetteer will have county level info
+ *
+ * @author mgiaconia
+ */
+public class AdminBoundary {
+
+ private static final String NO_DATA_FOUND_VALUE = "NO_DATA_FOUND";
+ private final String countryCode;
+ private final String provinceCode;
+ private final String provinceName;
+ private final String countryName;
+ private final String countyName;
+ private final String countyCode;
+
+ public AdminBoundary(String cc, String ac, String pname, String countryName) {
+ this.countryCode = cc;
+ this.provinceCode = ac;
+ this.provinceName = pname;
+ this.countryName = countryName;
+ this.countyCode = NO_DATA_FOUND_VALUE;
+ this.countyName = NO_DATA_FOUND_VALUE;
+ }
+
+ public AdminBoundary(String countryCode, String countryName, String provinceCode, String provinceName, String countyCode, String countyName) {
+ this.countryCode = countryCode;
+ this.provinceCode = provinceCode;
+ this.provinceName = provinceName;
+ this.countryName = countryName;
+ this.countyName = countyName.equals("") ? NO_DATA_FOUND_VALUE : countyName;
+ this.countyCode = countyCode.equals("") ? NO_DATA_FOUND_VALUE : countyCode;
+ }
+
+ public String getCountryCode() {
+ return countryCode;
+ }
+
+ public String getProvCode() {
+ return provinceCode;
+ }
+
+ public String getProvinceName() {
+ return provinceName;
+ }
+
+ public String getCountryName() {
+ return countryName;
+ }
+
+ public String getCountyName() {
+ return countyName;
+ }
+
+ public String getCountyCode() {
+ return countyCode;
+ }
+
+ @Override
+ public String toString() {
+ return "AdminBoundary{" + "countryCode=" + countryCode + ", provinceCode=" + provinceCode + ", provinceName=" + provinceName + ", countryName=" + countryName + ", countyName=" + countyName + ", countyCode=" + countyCode + '}';
+ }
+
+ @Override
+ public int hashCode() {
+ int hash = 7;
+ hash = 11 * hash + Objects.hashCode(this.countryCode);
+ hash = 11 * hash + Objects.hashCode(this.provinceCode);
+ hash = 11 * hash + Objects.hashCode(this.provinceName);
+ hash = 11 * hash + Objects.hashCode(this.countryName);
+ hash = 11 * hash + Objects.hashCode(this.countyName);
+ hash = 11 * hash + Objects.hashCode(this.countyCode);
+ return hash;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ final AdminBoundary other = (AdminBoundary) obj;
+ if (!Objects.equals(this.countryCode, other.countryCode)) {
+ return false;
+ }
+ if (!Objects.equals(this.provinceCode, other.provinceCode)) {
+ return false;
+ }
+ if (!Objects.equals(this.provinceName, other.provinceName)) {
+ return false;
+ }
+ if (!Objects.equals(this.countryName, other.countryName)) {
+ return false;
+ }
+ if (!Objects.equals(this.countyName, other.countyName)) {
+ return false;
+ }
+ if (!Objects.equals(this.countyCode, other.countyCode)) {
+ return false;
+ }
+ return true;
+ }
+
+}
Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContext.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ *
+ * @author mgiaconia
+ */
+public class AdminBoundaryContext {
+
+ private final Map<String, Set<Integer>> countryMentions;
+ private final Map<String, Set<Integer>> provMentions;
+ private final Map<String, Set<Integer>> countyMentions;
+ private final Set<String> countryHits;
+ private final Set<String> provHits;
+ private final Set<String> countyHits;
+ private final Map<String, String> countryRefMap;
+ private final Map<String, Map<String, String>> provRefMap;
+ private final Map<String, Map<String, String>> countyRefMap;
+ private final Set<String> whereClauses;
+ private final Map<String, Set<String>> nameCodesMap;
+
+ public AdminBoundaryContext(Map<String, Set<Integer>> countryMentions,
+ Map<String, Set<Integer>> provMentions,
+ Map<String, Set<Integer>> countyMentions,
+ Set<String> countryHits,
+ Set<String> provHits,
+ Set<String> countyHits,
+ Map<String, String> countryRefMap,
+ Map<String, Map<String, String>> provRefMap,
+ Map<String, Map<String, String>> countyRefMap, Map<String, Set<String>> nameCodesMap) {
+ this.countryMentions = countryMentions;
+ this.provMentions = provMentions;
+ this.countyMentions = countyMentions;
+ this.countryHits = countryHits;
+ this.provHits = provHits;
+ this.countyHits = countyHits;
+ this.countryRefMap = countryRefMap;
+ this.provRefMap = provRefMap;
+ this.countyRefMap = countyRefMap;
+ this.whereClauses = setWhereClauses();
+ this.nameCodesMap = nameCodesMap;
+ }
+
+ public Map<String, Set<String>> getNameCodesMap() {
+ return nameCodesMap;
+ }
+
+ public Map<String, Set<Integer>> getCountryMentions() {
+ return countryMentions;
+ }
+
+ public Map<String, Set<Integer>> getProvMentions() {
+ return provMentions;
+ }
+
+ public Map<String, Set<Integer>> getCountyMentions() {
+ return countyMentions;
+ }
+
+ public Set<String> getCountryHits() {
+ return countryHits;
+ }
+
+ public Set<String> getProvHits() {
+ return provHits;
+ }
+
+ public Set<String> getCountyHits() {
+ return countyHits;
+ }
+
+ public Map<String, String> getCountryRefMap() {
+ return countryRefMap;
+ }
+
+ public Map<String, Map<String, String>> getProvRefMap() {
+ return provRefMap;
+ }
+
+ public Map<String, Map<String, String>> getCountyRefMap() {
+ return countyRefMap;
+ }
+
+ public Set<String> getWhereClauses() {
+ return whereClauses;
+ }
+
+ private Set<String> setWhereClauses() {
+ Set<String> clauses = new HashSet<>();
+ for (String countryCode : this.getCountryHits()) {
+ String gazType = countryCode.toLowerCase().equals("us") ? " AND gazsource:usgs" : " AND gazsource:geonames";
+ if (countryCode.toLowerCase().matches(".*rg[0-9].*")) {
+ gazType = " AND gazsource:region";
+ }
+ Map<String, String> provsForCountry = this.getProvRefMap().get(countryCode);
+ if (provsForCountry == null) {
+ provsForCountry = new HashMap<>();
+ }
+ Map<String, String> provs = new HashMap<>();
+
+ if (!provsForCountry.isEmpty()) {
+ for (String pcode : provsForCountry.keySet()) {
+ if (this.getProvHits().contains(pcode)) {
+ provs.put(pcode, provsForCountry.get(pcode));
+
+ clauses.add(" countrycode:" + countryCode + " AND admincode:" + pcode + gazType);
+
+ }
+ }
+ }
+ if (provs.isEmpty()) {
+ //got a country with no mentioned provs
+ clauses.add(" countrycode:" + countryCode + gazType);
+ }
+ }
+ return clauses;
+ }
+
+}
Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java (from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java&r1=1594067&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/AdminBoundaryContextGenerator.java Fri Jul 11 01:04:58 2014
@@ -25,6 +25,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -36,25 +37,61 @@ import org.apache.log4j.Logger;
* Used to boost or degrade scoring of linked geo entities
*
*/
-public class CountryContext {
+public class AdminBoundaryContextGenerator {
- private static final Logger LOGGER = Logger.getLogger(CountryContext.class);
+ private static final Logger LOGGER = Logger.getLogger(AdminBoundaryContextGenerator.class);
private List<CountryContextEntry> countrydata;
private Map<String, Set<String>> nameCodesMap = new HashMap<>();
private Map<String, Set<Integer>> countryMentions = new HashMap<>();
private Set<CountryContextEntry> countryHits = new HashSet<>();
private EntityLinkerProperties properties;
-
- public CountryContext(EntityLinkerProperties properties) throws Exception {
+ private List<AdminBoundary> adminBoundaryData;
+ private Set<AdminBoundary> adminBoundaryHits = new HashSet<>();
+ private AdminBoundaryContext context;
+
+ public AdminBoundaryContext getContext(String text) {
+ context = null;
+ nameCodesMap.clear();
+ context = process(text);
+
+ return context;
+ }
+
+ private Set<String> countryHitSet = new HashSet<>();
+ private Map<String, String> countryMap = new HashMap<>();
+ private Map<String, Map<String, String>> provMap = new HashMap<>();
+ private Map<String, Map<String, String>> countyMap = new HashMap<>();
+
+ private Map<String, Set<Integer>> provMentions = new HashMap<>();
+ private Map<String, Set<Integer>> countyMentions = new HashMap<>();
+
+ private Set<String> provHits = new HashSet<String>();
+ private Set<String> countyHits = new HashSet<String>();
+
+ public static void main(String[] args) {
+ try {
+ AdminBoundaryContextGenerator countryContext = new AdminBoundaryContextGenerator(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));
+ GeoEntityLinker linker = new GeoEntityLinker();
+ linker.init(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties")));
+
+ countryContext.process("This artcle is about fairfax county virginia in the north of florida in the united states. It is also about Moscow and atlanta. Hillsborough county florida is a shithole. Eastern Africa people are cool.");
+
+ } catch (Exception ex) {
+ java.util.logging.Logger.getLogger(AdminBoundaryContextGenerator.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+
+ public AdminBoundaryContextGenerator(EntityLinkerProperties properties) throws Exception {
this.properties = properties;
if (countrydata == null) {
String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
-
+
File countryContextFile = new File(path);
- countrydata = getCountryContextFromFile(countryContextFile);
+ //countrydata = getCountryContextFromFile(countryContextFile);
+ adminBoundaryData = getContextFromFile(countryContextFile);
}
}
-
+
public Map<String, Set<Integer>> getCountryMentions() {
return countryMentions;
}
@@ -76,11 +113,95 @@ public class CountryContext {
public Map<String, Set<String>> getNameCodesMap() {
return nameCodesMap;
}
-
+
public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
this.nameCodesMap = nameCodesMap;
}
+ private void reset() {
+ this.nameCodesMap.clear();
+ this.countryHitSet.clear();
+ this.countryHits.clear();
+ this.countryMentions.clear();
+ this.provHits.clear();
+ this.provMentions.clear();
+ this.countyHits.clear();
+ this.countyMentions.clear();
+ this.adminBoundaryHits.clear();
+ }
+
+ /**
+ * Finds indicators of countries, provinces, and cities, as per the USGS and
+ * Geonames gazetteers. The results of this are used to score toponymns
+ * downstream. The full text of a document should be passed in here.
+ *
+ * @param text the full text of the document (block of text).
+ * @return
+ */
+ private AdminBoundaryContext process(String text) {
+ try {
+ if (text.contains("Convoy of terror")) {
+ System.out.println("");
+ }
+ reset();
+ Map<String, Set<Integer>> countryhitMap = regexfind(text, countryMap, countryHitSet);
+ if (!countryhitMap.isEmpty()) {
+ for (String cc : countryhitMap.keySet()) {
+ Map<String, String> provsForCc = provMap.get(cc);
+ if (provsForCc != null) {
+ provMentions = regexfind(text, provsForCc, provHits);
+ if (provMentions != null) {
+ for (String prov : provMentions.keySet()) {
+ Map<String, String> get = countyMap.get(prov);
+ if (get != null) {
+ countyMentions = regexfind(text, get, countyHits);
+ }
+ }
+ }
+ }
+ }
+ } else {
+ for (Map<String, String> provsForCc : provMap.values()) {
+ if (provsForCc != null) {
+ provMentions = regexfind(text, provsForCc, provHits);
+ if (provMentions != null) {
+ for (String prov : provMentions.keySet()) {
+ //fake a country hit based on a province hit... this gets fuzzy
+ String cc = prov.split("\\.")[0];
+ if (!countryhitMap.containsKey(cc)) {
+ countryhitMap.put(cc, provMentions.get(prov));
+ countryHitSet.add(cc);
+ } else {
+ countryhitMap.get(cc).addAll(provMentions.get(prov));
+ }
+ Map<String, String> get = countyMap.get(prov);
+ if (get != null) {
+ countyMentions = regexfind(text, get, countyHits);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ Map<String, String> countryRefMap = new HashMap<>();
+
+ for (String c : countryHitSet) {
+ String countryName = countryMap.get(c);
+ if (countryName != null) {
+ countryRefMap.put(c, countryName);
+ }
+ }
+
+ AdminBoundaryContext context = new AdminBoundaryContext(countryhitMap, provMentions, countyMentions, countryHitSet, provHits, countyHits, countryRefMap, provMap, countyMap, nameCodesMap);
+
+ return context;
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ return null;
+ }
+
/**
* Finds mentions of countries to assist in toponym resolution. Countries are
* discovered via regex based on a configured file called
@@ -92,19 +213,19 @@ public class CountryContext {
* list from the file.
*
* @param docText the full text of the document
- * @param properties EntityLinkerProperties for getting database connection
* @return
*/
+ @Deprecated
public Map<String, Set<Integer>> regexfind(String docText) {
countryMentions = new HashMap<>();
nameCodesMap.clear();
try {
-
+
for (CountryContextEntry entry : countrydata) {
Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher rs = regex.matcher(docText);
String code = entry.getCc1().toLowerCase();
-
+
boolean found = false;
while (rs.find()) {
found = true;
@@ -130,45 +251,156 @@ public class CountryContext {
if (found) {
countryHits.add(entry);
}
-
+
}
-
+
} catch (Exception ex) {
LOGGER.error(ex);
}
-
+
return countryMentions;
}
-
- private List<CountryContextEntry> getCountryContextFromFile(File countryContextFile) {
- List<CountryContextEntry> entries = new ArrayList<>();
- String path = countryContextFile.getPath();
+
+ /**
+ * discovers indicators of admin boundary data using regex.
+ *
+ * @param docText the full text
+ * @param lookupMap a map to use to find names. the key=a location code, the
+ * value is an actual name.
+ * @param hitsRef a reference to a set that stores the hits by id
+ * @return
+ */
+ private Map<String, Set<Integer>> regexfind(String docText, Map<String, String> lookupMap, Set<String> hitsRef) {
+ Map<String, Set<Integer>> mentions = new HashMap<>();
+ if (lookupMap == null) {
+ return mentions;
+ }
+ try {
+
+ for (String entry : lookupMap.keySet()) {
+ String name = lookupMap.get(entry).toLowerCase();
+ if (name == null) {
+ continue;
+ }
+ name = "[^\\p{L}\\p{Nd}]" + name.replace(", the", "") + "[^\\p{L}\\p{Nd}]";
+ Pattern regex = Pattern.compile(name, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+ Matcher rs = regex.matcher(docText);
+ String code = entry.toLowerCase();
+
+ boolean found = false;
+ while (rs.find()) {
+ found = true;
+ Integer start = rs.start();
+ String hit = rs.group().toLowerCase().trim();
+ hit = hit.replaceAll("\\.|,|;|\\?|!|\\\\|/|\"|'|=|-|&", "");
+ if (mentions.containsKey(code)) {
+ mentions.get(code).add(start);
+ } else {
+ Set<Integer> newset = new HashSet<Integer>();
+ newset.add(start);
+ mentions.put(code, newset);
+ }
+ if (!hit.equals("")) {
+ if (this.nameCodesMap.containsKey(hit)) {
+ nameCodesMap.get(hit).add(code);
+ } else {
+ HashSet<String> newset = new HashSet<String>();
+ newset.add(code);
+ nameCodesMap.put(hit, newset);
+ }
+ }
+
+ }
+ if (found) {
+ hitsRef.add(code);
+
+ }
+ }
+
+ } catch (Exception ex) {
+ LOGGER.error(ex);
+ ex.printStackTrace();
+
+ }
+
+ return mentions;
+ }
+
+ private List<AdminBoundary> getContextFromFile(File countryContextFile) {
+ if (this.adminBoundaryData != null && !this.adminBoundaryData.isEmpty()) {
+ return adminBoundaryData;
+ }
+ List<AdminBoundary> entries = new ArrayList<>();
BufferedReader reader;
-
try {
- path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
-
- reader = new BufferedReader(new FileReader(path));
-
- while (reader.read() != -1) {
- String line = reader.readLine();
+ reader = new BufferedReader(new FileReader(countryContextFile));
+ String line = "";
+ while ((line = reader.readLine()) != null) {
String[] values = line.split("\t");
- if (values.length != 4) {
- throw new IOException("improperly formatted country context file");
+ int len = values.length;
+ if (len < 5 || len > 6) {
+ throw new IllegalArgumentException("Improperly formatted file");
+ }
+ if (values.length == 6) {
+ AdminBoundary entry = new AdminBoundary(
+ values[0].toLowerCase().trim(),
+ values[3].toLowerCase().trim(),
+ values[1].toLowerCase().trim(),
+ values[4].toLowerCase().trim(),
+ values[2].toLowerCase().trim(),
+ values[5].toLowerCase().trim());
+ entries.add(entry);
+ } else {
+ AdminBoundary entry = new AdminBoundary(
+ values[0].toLowerCase().trim(),
+ values[3].toLowerCase().trim(),
+ values[1].toLowerCase().trim(),
+ values[4].toLowerCase().trim(),
+ values[2].toLowerCase().trim(),
+ "");
+ entries.add(entry);
}
- CountryContextEntry entry = new CountryContextEntry();
- // rc,cc1, full_name_nd_ro,dsg
- entry.setRc(values[0].toLowerCase());
- entry.setCc1(values[1].toLowerCase());
- entry.setFull_name_nd_ro(values[2].toLowerCase());
- entry.setDsg(values[3].toLowerCase());
- entries.add(entry);
+
}
reader.close();
} catch (IOException ex) {
LOGGER.error(ex);
}
+ loadMaps(entries);
return entries;
-
+
}
+
+ private void loadMaps(List<AdminBoundary> boundaries) {
+ for (AdminBoundary adm : boundaries) {
+ if (!adm.getCountryCode().equals("null")) {
+ countryMap.put(adm.getCountryCode(), adm.getCountryName());
+
+ if (!adm.getProvCode().equals("null")) {
+ Map<String, String> provs = provMap.get(adm.getCountryCode());
+ if (provs == null) {
+ provs = new HashMap<>();
+ }
+ //if (!provs.containsKey(adm.getProvCode())) {
+ provs.put(adm.getCountryCode() + "." + adm.getProvCode(), adm.getProvinceName());
+ provMap.put(adm.getCountryCode(), provs);
+ // }
+
+ if (!adm.getCountyCode().toLowerCase().equals("no_data_found") && !adm.getCountyName().toLowerCase().equals("no_data_found")) {
+ Map<String, String> counties = countyMap.get(adm.getCountryCode() + "." + adm.getProvCode());
+ if (counties == null) {
+ counties = new HashMap<>();
+ } // if (!counties.containsKey(adm.getCountyCode())) {
+ String countyid = adm.getCountryCode() + "." + adm.getProvCode() + "." + adm.getCountyCode();
+ counties.put(countyid, adm.getCountyName());
+ countyMap.put(adm.getCountryCode() + "." + adm.getProvCode(), counties);
+ // }
+
+ }
+
+ }
+ }
+ }
+ }
+
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java?rev=1609600&r1=1609599&r2=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java Fri Jul 11 01:04:58 2014
@@ -31,6 +31,9 @@ public class GazetteerEntry extends Base
private String source;
private String indexID;
private Map<String, String> indexData = new HashMap<>();
+ private String countryCode;
+ private String provinceCode;
+ private String hierarchy;
/**
* returns the id from the lucene document
@@ -159,5 +162,28 @@ public class GazetteerEntry extends Base
return true;
}
+ public String getCountryCode() {
+ return countryCode;
+ }
+
+ public void setCountryCode(String countryCode) {
+ this.countryCode = countryCode;
+ }
+
+ public String getProvinceCode() {
+ return provinceCode;
+ }
+
+ public void setProvinceCode(String provinceCode) {
+ this.provinceCode = provinceCode;
+ }
+
+ public String getHierarchy() {
+ return hierarchy;
+ }
+
+ public void setHierarchy(String hierarchy) {
+ this.hierarchy = hierarchy;
+ }
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java?rev=1609600&r1=1609599&r2=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearchCache.java Fri Jul 11 01:04:58 2014
@@ -28,7 +28,7 @@ public class GazetteerSearchCache {
private static Map<String, ArrayList<GazetteerEntry>> gazCache = new HashMap<>();
/**
- * returns the cached entries. Returns null if the query does not exists in the cache
+ * returns the cached entries. Returns null if the query does not exist in the cache
* @param searchString
* @return
*/
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1609600&r1=1609599&r2=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Fri Jul 11 01:04:58 2014
@@ -38,6 +38,7 @@ import org.apache.lucene.util.Version;
import opennlp.tools.entitylinker.EntityLinkerProperties;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.search.Sort;
/**
*
@@ -63,11 +64,16 @@ public class GazetteerSearcher {
private Analyzer usgsAnalyzer;
private EntityLinkerProperties properties;
+ private Directory opennlpIndex;//= new MMapDirectory(new File(indexloc));
+ private IndexReader opennlpReader;// = DirectoryReader.open(geonamesIndex);
+ private IndexSearcher opennlpSearcher;// = new IndexSearcher(geonamesReader);
+ private Analyzer opennlpAnalyzer;
+
public static void main(String[] args) {
try {
boolean b = Boolean.valueOf("true");
- new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("townsville, queensland", 5, "");
+ new GazetteerSearcher(new EntityLinkerProperties(new File("c:\\temp\\entitylinker.properties"))).geonamesFind("baghdad", 5, "iz");
} catch (IOException ex) {
java.util.logging.Logger.getLogger(GazetteerSearcher.class.getName()).log(Level.SEVERE, null, ex);
} catch (Exception ex) {
@@ -79,6 +85,112 @@ public class GazetteerSearcher {
this.properties = properties;
init();
}
+/**
+ * Searches the single lucene index that includes the location hierarchy.
+ * @param searchString the location name to search for
+ * @param rowsReturned how many index entries to return (top N...)
+ * @param whereClause the conditional statement that defines the index type and the country oode.
+ * @return
+ */
+ public ArrayList<GazetteerEntry> find(String searchString, int rowsReturned, String whereClause) {
+ ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
+ searchString = cleanInput(searchString);
+ if (searchString.isEmpty()) {
+ return linkedData;
+ }
+ try {
+ /**
+ * build the search string Sometimes no country context is found. In this
+ * case the code variables will be empty strings
+ */
+ String placeNameQueryString = "placename:(" + searchString.toLowerCase() + ") AND " + whereClause;
+ if (searchString.trim().contains(" ")) {
+ placeNameQueryString = "(placename:(" + searchString.toLowerCase() + ") AND hierarchy:(" + formatForHierarchy(searchString) + "))"
+ + " AND " + whereClause;
+ }
+
+ // luceneQueryString = "hierarchy:(tampa florida) AND gazsource:usgs";
+ /**
+ * check the cache and go no further if the records already exist
+ */
+ ArrayList<GazetteerEntry> get = GazetteerSearchCache.get(placeNameQueryString);
+ if (get != null) {
+
+ return get;
+ }
+ /**
+ * search the placename
+ */
+ QueryParser parser = new QueryParser(Version.LUCENE_48, placeNameQueryString, opennlpAnalyzer);
+ Query q = parser.parse(placeNameQueryString);
+
+ TopDocs bestDocs = opennlpSearcher.search(q, rowsReturned, Sort.RELEVANCE);
+
+ for (int i = 0; i < bestDocs.scoreDocs.length; ++i) {
+ GazetteerEntry entry = new GazetteerEntry();
+ int docId = bestDocs.scoreDocs[i].doc;
+ double sc = bestDocs.scoreDocs[i].score;
+
+ entry.getScoreMap().put("lucene", sc);
+ entry.setIndexID(docId + "");
+
+ Document d = opennlpSearcher.doc(docId);
+
+ List<IndexableField> fields = d.getFields();
+
+ String lat = d.get("latitude");
+ String lon = d.get("longitude");
+ String placename = d.get("placename");
+ String parentid = d.get("countrycode").toLowerCase();
+ String provid = d.get("admincode");
+ String itemtype = d.get("loctype");
+ String source = d.get("gazsource");
+ String hier = d.get("hierarchy");
+ entry.setSource(source);
+
+ entry.setItemID(docId + "");
+ entry.setLatitude(Double.valueOf(lat));
+ entry.setLongitude(Double.valueOf(lon));
+ entry.setItemType(itemtype);
+ entry.setItemParentID(parentid);
+ entry.setProvinceCode(provid);
+ entry.setCountryCode(parentid);
+ entry.setItemName(placename);
+ entry.setHierarchy(hier);
+ for (int idx = 0; idx < fields.size(); idx++) {
+ entry.getIndexData().put(fields.get(idx).name(), d.get(fields.get(idx).name()));
+ }
+ /**
+ * norm the levenstein distance
+ */
+ int maxLen = searchString.length() > entry.getItemName().length() ? searchString.length() : entry.getItemName().length();
+
+ Double normLev = Math.abs(1-(sc / (double) maxLen));//searchString.length() / (double) entry.getItemName().length();
+ /**
+ * only want hits above the levenstein thresh. This should be a low
+ * thresh due to the use of the hierarchy field in the index
+ */
+ if (normLev.compareTo(scoreCutoff) >= 0) {
+// if (entry.getItemParentID().toLowerCase().equals(parentid.toLowerCase()) || parentid.toLowerCase().equals("")) {
+ entry.getScoreMap().put("normlucene", normLev);
+ //make sure we don't produce a duplicate
+ if (!linkedData.contains(entry)) {
+ linkedData.add(entry);
+ /**
+ * add the records to the cache for this query
+ */
+ GazetteerSearchCache.put(placeNameQueryString, linkedData);
+ }
+// }
+ }
+ }
+
+ } catch (IOException | ParseException ex) {
+ LOGGER.error(ex);
+ }
+
+ return linkedData;
+ }
/**
*
@@ -88,6 +200,7 @@ public class GazetteerSearcher {
*
* @return
*/
+ @Deprecated
public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
searchString = cleanInput(searchString);
@@ -198,6 +311,7 @@ public class GazetteerSearcher {
*
* @return
*/
+ @Deprecated
public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
searchString = cleanInput(searchString);
@@ -284,7 +398,8 @@ public class GazetteerSearcher {
}
/**
- * Replaces any noise chars with a space, and depending on configuration adds double quotes to the string
+ * Replaces any noise chars with a space, and depending on configuration adds
+ * double quotes to the string
*
* @param input
* @return
@@ -300,36 +415,66 @@ public class GazetteerSearcher {
}
private void init() throws Exception {
- if (usgsIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+// if (usgsIndex == null) {
+// String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+// if (indexloc.equals("")) {
+// // System.out.println("USGS Gaz location not found");
+// LOGGER.error(new Exception("USGS Gaz location not found"));
+// }
+// String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+//
+// scoreCutoff = Double.valueOf(cutoff);
+// String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
+// doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
+// usgsIndex = new MMapDirectory(new File(indexloc));
+// usgsReader = DirectoryReader.open(usgsIndex);
+// usgsSearcher = new IndexSearcher(usgsReader);
+// usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+// }
+// if (geonamesIndex == null) {
+// String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
+// if (indexloc.equals("")) {
+// LOGGER.error(new Exception("Geonames Gaz location not found"));
+//
+// }
+// String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+// scoreCutoff = Double.valueOf(cutoff);
+// geonamesIndex = new MMapDirectory(new File(indexloc));
+// geonamesReader = DirectoryReader.open(geonamesIndex);
+// geonamesSearcher = new IndexSearcher(geonamesReader);
+// //TODO: a language code switch statement should be employed here at some point
+// geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+//
+// }
+ if (opennlpIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz", "");
if (indexloc.equals("")) {
- // System.out.println("USGS Gaz location not found");
- LOGGER.error(new Exception("USGS Gaz location not found"));
+ LOGGER.error(new Exception("Opennlp combined Gaz directory location not found"));
+
}
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+ // String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+ // scoreCutoff = Double.valueOf(cutoff);
+ opennlpIndex = new MMapDirectory(new File(indexloc));
+ opennlpReader = DirectoryReader.open(opennlpIndex);
+ opennlpSearcher = new IndexSearcher(opennlpReader);
+ //TODO: a language code switch statement should be employed here at some point
+ opennlpAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
- scoreCutoff = Double.valueOf(cutoff);
- String doubleQuote = properties.getProperty("opennlp.geoentitylinker.gaz.doublequote", String.valueOf(doubleQuoteAllSearchTerms));
- doubleQuoteAllSearchTerms = Boolean.valueOf(doubleQuote);
- usgsIndex = new MMapDirectory(new File(indexloc));
- usgsReader = DirectoryReader.open(usgsIndex);
- usgsSearcher = new IndexSearcher(usgsReader);
- usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
}
- if (geonamesIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
- if (indexloc.equals("")) {
- LOGGER.error(new Exception("Geonames Gaz location not found"));
+ }
+ private String formatForHierarchy(String searchTerm) {
+ String[] parts = searchTerm.split(" ");
+ String out = "";
+ if (parts.length != 0) {
+ for (String string : parts) {
+ out += string + " AND ";
}
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
- scoreCutoff = Double.valueOf(cutoff);
- geonamesIndex = new MMapDirectory(new File(indexloc));
- geonamesReader = DirectoryReader.open(geonamesIndex);
- geonamesSearcher = new IndexSearcher(geonamesReader);
- //TODO: a language code switch statement should be employed here at some point
- geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
-
+ out = out.substring(0, out.lastIndexOf(" AND "));
+ } else {
+ out = cleanInput(searchTerm);
}
+ return out;
}
+
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1609600&r1=1609599&r2=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Fri Jul 11 01:04:58 2014
@@ -15,6 +15,11 @@
*/
package opennlp.addons.geoentitylinker;
+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
+import opennlp.addons.geoentitylinker.scoring.LinkedEntityScorer;
+import opennlp.addons.geoentitylinker.scoring.CountryProximityScorer;
+import opennlp.addons.geoentitylinker.scoring.GeoHashBinningScorer;
+import opennlp.addons.geoentitylinker.scoring.FuzzyStringMatchScorer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
@@ -33,11 +38,11 @@ import opennlp.tools.entitylinker.Entity
*/
public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
- private CountryContext countryContext;
+ private AdminBoundaryContextGenerator countryContext;
private Map<String, Set<Integer>> countryMentions;
private EntityLinkerProperties linkerProperties;
private GazetteerSearcher gazateerSearcher;
- private List<LinkedEntityScorer> scorers = new ArrayList<>();
+ private List<LinkedEntityScorer<AdminBoundaryContext>> scorers = new ArrayList<>();
@Override
public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
@@ -46,8 +51,8 @@ public class GeoEntityLinker implements
if (linkerProperties == null) {
throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
}
- countryMentions = countryContext.regexfind(doctext);
-
+ //countryMentions = countryContext.regexfind(doctext);
+ AdminBoundaryContext context = countryContext.getContext(doctext);
for (int s = 0; s < sentences.length; s++) {
Span[] names = namesBySentence[s];
String[] tokens = tokensBySentence[s];
@@ -55,51 +60,27 @@ public class GeoEntityLinker implements
for (int i = 0; i < matches.length; i++) {
- /**
- * nga gazateer is for other than US placenames,don't want to use it if
- * US is the only country mentioned in the doc
- *
- */
ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
- if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)
- || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
-
- if (!countryMentions.keySet().isEmpty()) {
- for (String code : countryMentions.keySet()) {
- if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code));
- }
- }
- } else {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, ""));
-
+ if (!context.getWhereClauses().isEmpty()) {
+ for (String whereclause : context.getWhereClauses()) {
+ geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, whereclause));
}
-
- }
- ArrayList<BaseLink> usgsEntries = new ArrayList<>();
- if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
- //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
- usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3));
- }
- LinkedSpan<BaseLink> geoSpan = new LinkedSpan<>(geoNamesEntries, names[i].getStart(), names[i].getEnd(), "location",names[i].getProb());
-
-
- if (!usgsEntries.isEmpty()) {
- geoSpan.getLinkedEntries().addAll(usgsEntries);
- geoSpan.setSearchTerm(matches[i]);
- }
-
- if (!geoSpan.getLinkedEntries().isEmpty()) {
- geoSpan.setSearchTerm(matches[i]);
- geoSpan.setSentenceid(s);
- spans.add(geoSpan);
+ }else{//this means there were no where clauses generated so the where clause will default to look at the entire index
+ geoNamesEntries.addAll(gazateerSearcher.find(matches[i], 3, " gaztype:* "));
}
+ //start generating queries
+ LinkedSpan newspan = new LinkedSpan(geoNamesEntries, names[i], 0);
+ newspan.setSearchTerm(matches[i]);
+ newspan.setLinkedEntries(geoNamesEntries);
+ newspan.setSentenceid(s);
+ spans.add(newspan);
}
+
}
if (!scorers.isEmpty()) {
for (LinkedEntityScorer scorer : scorers) {
- scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
+ scorer.score(spans, doctext, sentences, linkerProperties, context);
}
}
@@ -111,6 +92,8 @@ public class GeoEntityLinker implements
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
scorers.add(new ModelBasedScorer());
+ scorers.add(new FuzzyStringMatchScorer());
+ // scorers.add(new ProvinceProximityScorer());
}
}
@@ -118,7 +101,7 @@ public class GeoEntityLinker implements
public void init(EntityLinkerProperties properties) {
try {
this.linkerProperties = properties;
- countryContext = new CountryContext(this.linkerProperties);
+ countryContext = new AdminBoundaryContextGenerator(this.linkerProperties);
gazateerSearcher = new GazetteerSearcher(this.linkerProperties);
loadScorers();
} catch (Exception ex) {
Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java (from r1594067, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java&r1=1594067&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java Fri Jul 11 01:04:58 2014
@@ -13,20 +13,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.indexing;
-import java.io.BufferedReader;
import java.io.File;
-import java.io.FileReader;
+import java.io.FileNotFoundException;
import java.util.ArrayList;
-import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
@@ -39,11 +34,26 @@ import org.apache.lucene.util.Version;
*/
public class GazetteerIndexer {
+ public static void main(String[] args) {
+ try {
+ GazetteerIndexer i = new GazetteerIndexer();
+ i.index(new File("C:\\temp\\gazetteers\\geonamesdata\\allcountries\\allCountries.txt"),
+ new File("C:\\temp\\gazetteers\\geonamesdata\\countryinfo.txt"),
+ new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"),
+ new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"),
+ new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"),
+ new File("C:\\temp\\gazetteers\\"),
+ new File("C:\\temp\\gazetteers\\newCountryContextFile.txt"),
+ new File("C:\\temp\\gazetteers\\regions.txt"));
+ } catch (Exception ex) {
+ ex.printStackTrace();
+ }
+ }
+
public GazetteerIndexer() {
}
-
public static interface Separable {
String getSeparator();
@@ -76,64 +86,142 @@ public class GazetteerIndexer {
}
/**
- * indexes the USGS or Geonames gazateers.
*
- * @param outputIndexDir a DIRECTORY path where you would like to store
- * the output lucene indexes
- * @param gazetteerInputData the file, "as is" that was downloaded from the
- * USGS and GEONAMES website
- * @param type indicates whether the data is USGS or GEONAMES
- * format
+ * @param geonamesData the actual Geonames gazetteer data downloaded from
+ * here: http://download.geonames.org/export/dump/ then click on this
+ * link 'allCountries.zip'
+ * @param geoNamesCountryInfo the countryinfo lookup table that can be
+ * downloaded from here
+ * http://download.geonames.org/export/dump/countryinfo.txt
+ * @param geonamesAdmin1CodesASCII The lookup data for the province names for
+ * each place found here:
+ * http://download.geonames.org/export/dump/admin1CodesASCII.txt highlight the
+ * table view, and copy results into a text file. Make sure the tab delimitted
+ * format is maintained.
+ * @param usgsDataFile the actual USGS gazetteer downloaded from here:
+ * http://geonames.usgs.gov/domestic/download_data.htm click on the
+ * national_file####.zip link to get all the most recent features
+ *
+ * @param usgsGovUnitsFile go to here:
+ * http://geonames.usgs.gov/domestic/download_data.htm in the section titled
+ * "Topical Gazetteers -- File Format" click on the drop down list and select
+ * "Government Units". The downloaded file is what you need for this param.
+ * @param outputIndexDir where you want the final index. Must be a directory,
+ * not an actual file.
+ * @param outputCountryContextFile The output countrycontext file. THis is a
+ * very important file used inside the GeoEntityLinker to assist in toponym
+ * resolution.
+ * @param regionsFile this file contains a list of regions in the following
+ * format: tab delimitted text with index 0 as the name of the region, index 1
+ * as the longitude, and index 2 as the latitude
* @throws Exception
*/
- public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {
+ public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII,
+ File usgsDataFile, File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile) throws Exception {
if (!outputIndexDir.isDirectory()) {
throw new IllegalArgumentException("outputIndexDir must be a directory.");
}
+ if (!geonamesData.exists()) {
+ throw new FileNotFoundException("geonames data file does not exist");
+ }
+ if (!geoNamesCountryInfo.exists()) {
+ throw new FileNotFoundException("geoNamesCountryCodes data file does not exist");
+ }
+ if (!geonamesAdmin1CodesASCII.exists()) {
+ throw new FileNotFoundException("geonamesAdmin1CodesASCII data file does not exist");
+ }
- String indexloc = outputIndexDir + type.toString();
+ if (!usgsDataFile.exists()) {
+ throw new FileNotFoundException("usgsDataFile data file does not exist");
+ }
+ if (!usgsGovUnitsFile.exists()) {
+ throw new FileNotFoundException("usgsGovUnitsFile data file does not exist");
+ }
+ if (!outputIndexDir.exists()) {
+ throw new FileNotFoundException("outputIndexDir data file does not exist");
+ }
+ if (!regionsFile.exists()) {
+ throw new FileNotFoundException("regionsFile data file does not exist");
+ }
+
+ String indexloc = outputIndexDir.getPath() + "/opennlp_geoentitylinker_gazetteer";
Directory index = new MMapDirectory(new File(indexloc));
Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);
IndexWriter w = new IndexWriter(index, config);
+ USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);
- readFile(gazetteerInputData, w, type);
+ GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData, outputCountryContextFile, w);
+
+ RegionProcessor.process(regionsFile, outputCountryContextFile, w);
w.commit();
w.close();
-
+ System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' and context file '" + outputCountryContextFile.getPath() + "' to entitylinker.properties file");
}
- public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
- BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<>();
- int counter = 0;
- System.out.println("reading gazetteer data from file...........");
- while (reader.read() != -1) {
- String line = reader.readLine();
- String[] values = line.split(type.getSeparator());
- if (counter == 0) {
- for (String columnName : values) {
- fields.add(columnName.replace("»¿", "").trim());
- }
-
- } else {
- Document doc = new Document();
- for (int i = 0; i < fields.size() - 1; i++) {
- doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
- }
- w.addDocument(doc);
- }
- counter++;
- if (counter % 100000 == 0) {
- w.commit();
- System.out.println(counter + " .........committed to index..............");
- }
+ /**
+ * indexes the USGS or Geonames gazateers.
+ *
+ * @param outputIndexDir a DIRECTORY path where you would like to store the
+ * output lucene indexes
+ * @param gazetteerInputData the file, "as is" that was downloaded from the
+ * USGS and GEONAMES website
+ * @param type indicates whether the data is USGS or GEONAMES format
+ * @throws Exception
+ */
+ @Deprecated
+ public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {
+ if (!outputIndexDir.isDirectory()) {
+ throw new IllegalArgumentException("outputIndexDir must be a directory.");
}
+
+ String indexloc = outputIndexDir + type.toString();
+ Directory index = new MMapDirectory(new File(indexloc));
+
+ Analyzer a = new StandardAnalyzer(Version.LUCENE_48, new CharArraySet(Version.LUCENE_48, new ArrayList(), true));
+ IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, a);
+
+ IndexWriter w = new IndexWriter(index, config);
+ // GeonamesProcessor.process(new File("C:\\temp\\gazetteers\\geonamesdata\\countrycodes.txt"), new File("C:\\temp\\gazetteers\\geonamesdata\\admin1CodesASCII.txt"), gazetteerInputData, null, w);
+ // USGSProcessor.process(gazetteerInputData, outputIndexDir, w);
+ // readFile(gazetteerInputData, w, type);
w.commit();
- System.out.println("Completed indexing gaz! index name is: " + type.toString());
+ w.close();
+
}
+//
+// public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
+// BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+// List<String> fields = new ArrayList<>();
+// int counter = 0;
+// System.out.println("reading gazetteer data from file...........");
+// while (reader.read() != -1) {
+// String line = reader.readLine();
+// String[] values = line.split(type.getSeparator());
+// if (counter == 0) {
+// for (String columnName : values) {
+// fields.add(columnName.replace("»¿", "").trim());
+// }
+//
+// } else {
+// Document doc = new Document();
+// for (int i = 0; i < fields.size() - 1; i++) {
+// doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+// }
+// w.addDocument(doc);
+// }
+// counter++;
+// if (counter % 100000 == 0) {
+// w.commit();
+// System.out.println(counter + " .........committed to index..............");
+// }
+//
+// }
+// w.commit();
+// System.out.println("Completed indexing gaz! index name is: " + type.toString());
+// }
}
Copied: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java (from r1585862, opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java)
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java?p2=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java&p1=opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java&r1=1585862&r2=1609600&rev=1609600&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeoEntityLinkerSetupUtils.java Fri Jul 11 01:04:58 2014
@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package opennlp.addons.geoentitylinker;
+package opennlp.addons.geoentitylinker.indexing;
import java.io.BufferedOutputStream;
import java.io.File;
@@ -28,6 +28,8 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundaryContextGenerator;
+import opennlp.addons.geoentitylinker.scoring.ModelBasedScorer;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
@@ -41,6 +43,7 @@ import opennlp.tools.util.PlainTextByLin
*
* Tools for setting up GeoEntityLinker gazateers and doccat scoring model
*/
+@Deprecated
public class GeoEntityLinkerSetupUtils {
private static final int RADIUS = 200;
public static ModelBasedScorer scorer;
@@ -86,7 +89,7 @@ public class GeoEntityLinkerSetupUtils {
* @throws IOException
*/
public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {
- CountryContext context = new CountryContext(properties);
+ AdminBoundaryContextGenerator context = new AdminBoundaryContextGenerator(properties);
FileWriter writer = new FileWriter(annotationOutFile, true);
System.out.println("processing " + documents.size() + " documents");
for (String docText : documents) {
@@ -131,7 +134,7 @@ public class GeoEntityLinkerSetupUtils {
* @param radius
* @return
*/
- private static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+ private static Map<String, ArrayList<String>> modelCountryContext(String docText, AdminBoundaryContextGenerator additionalContext, int radius) {
Map<String, ArrayList< String>> featureBags = new HashMap<>();
Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
/**
Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesFileDownloader.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+/**
+ *
+ * @author mgiaconia
+ */
+import java.io.*;
+import java.net.*;
+import java.util.Enumeration;
+import java.util.zip.*;
+public class GeonamesFileDownloader {
+
+ final static int size = 1024;
+ private static final String ALL_COUNTRIES = "http://download.geonames.org/export/dump/ZM.zip";
+ private static final String COUNTRY_INFO = "";
+ private static final String ADM1_LOOKUP = "";
+
+ public static void main(String[] args) {
+ downloadGeonamesFiles(COUNTRY_INFO, "c:\\temp\\gazetteers");
+ }
+
+ public static void downloadGeonamesFiles(String outputFileName, String outputDir) {
+ String fileDownload = fileDownload(ALL_COUNTRIES, outputDir);
+
+ unzipMyZip(fileDownload, outputDir);
+
+ fileDownload(COUNTRY_INFO, outputDir);
+ fileDownload(ADM1_LOOKUP, outputDir);
+
+ }
+
+ public static final void writeFile(InputStream in, OutputStream out)
+ throws IOException {
+ byte[] buffer = new byte[1024];
+ int len;
+
+ while ((len = in.read(buffer)) != 0) {
+ out.write(buffer, 0, len);
+ }
+
+ in.close();
+ out.close();
+ }
+
+ public static void unzipMyZip(String zipFileName,
+ String directoryToExtractTo) {
+ Enumeration entriesEnum;
+ ZipFile zip;
+ try {
+ zip = new ZipFile(zipFileName);
+ entriesEnum = zip.entries();
+ while (entriesEnum.hasMoreElements()) {
+ ZipEntry entry = (ZipEntry) entriesEnum.nextElement();
+ InputStream is = zip.getInputStream(entry); // get the input stream
+ OutputStream os = new java.io.FileOutputStream(new File(zipFileName.replace("\\.zip", ".txt")));
+ byte[] buf = new byte[4096];
+ int r;
+ while ((r = is.read(buf)) != -1) {
+ os.write(buf, 0, r);
+ }
+ os.close();
+ is.close();
+ }
+ } catch (IOException ioe) {
+ System.err.println("Some Exception Occurred:");
+ ioe.printStackTrace();
+ return;
+ }
+ }
+
+ public static String fileUrl(String fAddress, String localFileName, String destinationDir) {
+ OutputStream outStream = null;
+ URLConnection uCon = null;
+ String filename = destinationDir + "\\" + localFileName;
+ InputStream is = null;
+ try {
+ URL Url;
+ byte[] buf;
+ int ByteRead, ByteWritten = 0;
+ Url = new URL(fAddress);
+ outStream = new BufferedOutputStream(new FileOutputStream(destinationDir + "\\" + localFileName));
+
+ uCon = Url.openConnection();
+ is = uCon.getInputStream();
+ buf = new byte[size];
+ while ((ByteRead = is.read(buf)) != -1) {
+ outStream.write(buf, 0, ByteRead);
+ ByteWritten += ByteRead;
+ }
+ System.out.println("Downloaded Successfully.");
+ System.out.println("File name:\"" + localFileName + "\"\nNo ofbytes :" + ByteWritten);
+ } catch (Exception e) {
+ e.printStackTrace();
+ } finally {
+ try {
+ is.close();
+ outStream.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ return filename;
+ }
+
+ public static String fileDownload(String fAddress, String destinationDir) {
+ int slashIndex = fAddress.lastIndexOf('/');
+ int periodIndex = fAddress.lastIndexOf('.');
+
+ String fileName = fAddress.substring(slashIndex + 1);
+ String retFileName = "";
+ if (periodIndex >= 1 && slashIndex >= 0
+ && slashIndex < fAddress.length() - 1) {
+ retFileName = fileUrl(fAddress, fileName, destinationDir);
+ } else {
+ System.err.println("path or file name.");
+ }
+ return retFileName;
+ }
+
+}
Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundary;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ *
+ * @author mgiaconia
+ */
+public class GeonamesProcessor {
+
+ public static void process(File countryCodesLookupFile, File adm1CodesLookupFile, File geonamesGazetteerFile, File outputCountryContextFile, IndexWriter w) throws Exception {
+ Map<String, String> countryCodes = getCountryCodes(countryCodesLookupFile);
+
+ Map<String, AdminBoundary> adm1s = getProvData(adm1CodesLookupFile, countryCodes);
+ // List<AdminBoundary> adm2s = getCountryContextFromFile(new File("C:\\temp\\gazetteers\\geonamesdata\\admin2Codes.txt"));
+ //admin2Codes.txt
+
+ readFile(geonamesGazetteerFile, GazetteerIndexer.GazType.GEONAMES, adm1s, countryCodes, w);
+ //now append to the coutnry context file
+ writeCountryContextFile(outputCountryContextFile, adm1s);
+
+ }
+
+ public GeonamesProcessor() {
+ }
+
+ private static Map<String, AdminBoundary> getProvData(File adm1CodesLookupFile, Map<String, String> ccodes) {
+ System.out.println("Attempting to read geonames province data from: " + adm1CodesLookupFile.getPath());
+
+ Map<String, AdminBoundary> outmap = new HashMap<>();
+ BufferedReader reader;
+ Set<String> nullcodes = new HashSet<>();
+ try {
+
+ reader = new BufferedReader(new FileReader(adm1CodesLookupFile));
+ int i = 0;
+ String line = "";
+ while ((line = reader.readLine()) != null) {
+
+ // String line = reader.readLine();
+ String[] values = line.split("\t");
+ if (values.length != 4) {
+ throw new IOException("improperly formatted province lookup file");
+ }
+ String ccode = values[0].toLowerCase();
+
+ String[] split = ccode.split("\\.");
+ String pcode = "";
+ if (split.length == 2) {
+ //System.out.println(split);
+ ccode = split[0];
+ pcode = split[1];
+ }
+
+ String pname = values[2];
+
+ if (ccode.matches("[0-9].*")) {
+ String code = ccode;
+ ccode = pcode;
+ pcode = code;
+ }
+
+ String cname = ccodes.get(ccode);
+
+ if (cname == null) {
+ nullcodes.add(ccode);
+ }
+ AdminBoundary data = new AdminBoundary(ccode, pcode, pname, cname);
+ // System.out.println(data);
+ outmap.put(ccode + "." + pcode, data);
+
+ }
+ System.out.println("INFO: there were " + nullcodes.size() + " null prov codes. This is due to inconsistencies in reference data.");
+ reader.close();
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ System.out.println("Successfully read geonames province data from: " + adm1CodesLookupFile.getPath());
+
+ return outmap;
+
+ }
+
+ private static Map<String, String> getCountryCodes(File countryContextFile) {
+ Map<String, String> ccs = new HashMap<>();
+ BufferedReader reader;
+ try {
+
+ reader = new BufferedReader(new FileReader(countryContextFile));
+ int i = 0;
+ String line = "";
+ boolean start = false;
+ while ((line = reader.readLine()) != null) {
+ if (!line.toLowerCase().startsWith("#iso\t") && !start) {
+
+ continue;
+ } else {
+ start = true;
+ }
+ String[] values = line.split("\t");
+
+ String ccode = values[0].toLowerCase();//this is the 2 digit ISO code
+ String cname = values[4].toLowerCase();
+ if (!ccode.equals("")) {
+ ccs.put(ccode, cname);
+ }
+
+ }
+ reader.close();
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ ccs.put("SS", "South Sudan");
+ ccs.put("CS", "Kosovo");
+ return ccs;
+
+ }
+
+ public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
+ // FileWriter writer = null;
+ try (FileWriter writer = new FileWriter(outfile, true)) {
+
+ for (String admKey : adms.keySet()) {
+ AdminBoundary adm = adms.get(admKey);
+ if (adm == null) {
+ continue;
+ }
+ String province = adm.getProvinceName();
+ String country = adm.getCountryName();
+
+ String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + "" + "\t" + country + "\t" + province + "\t" + "" + "\n";
+ writer.write(line);
+ // System.out.println(line);
+
+ }
+ writer.close();
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ System.out.println("successfully wrote Geonames entries to country oontext file");
+ }
+
+ /**
+ *
+ * @param gazateerInputData the Geonames allCounties.txt file
+ * @param type the types of gaz entry, usgs, geonames, or regions
+ * @param adms the province info
+ * @param countrycodes the country code info
+ * @param w the lucene index writer
+ * @throws Exception
+ */
+ public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception {
+
+ BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+ String[] fieldStrings = new String[]{
+ "geonameid",
+ "name",
+ "asciiname",
+ "alternatenames",
+ "latitude",
+ "longitude",
+ "feature_class",
+ "feature_code",
+ "country code",
+ "cc2",
+ "admin1_code",
+ "admin2_code",
+ "admin3_code",
+ "admin4_code",
+ "population",
+ "elevation",
+ "dem ",
+ "timezone",
+ "modification_date"};
+
+ List<String> fields = Arrays.asList(fieldStrings);
+ int counter = 0;
+ System.out.println("reading gazetteer data from file...........");
+ String line = "";
+ while ((line = reader.readLine()) != null) {
+ String[] values = line.split(type.getSeparator());
+
+ Document doc = new Document();
+ String admincode = values[10].toLowerCase();
+ String ccode = values[8].toLowerCase();
+ if (ccode.contains(",")) {
+ String[] codes = ccode.split(",");
+ if (codes.length > 0) {
+ ccode = codes[0];
+ }
+ }
+ AdminBoundary adm = adms.get(ccode + "." + admincode);
+
+ String placeName = values[2];
+ String lat = values[4];
+ String lon = values[5];
+ String dsg = values[7];
+ String id = values[0];
+ String concatIndexEntry = "";
+ if (adm != null) {
+ concatIndexEntry = adm.getCountryName() + ", " + adm.getProvinceName() + ", " + placeName;
+ } else {
+ //there is no admin info, but we can still use the countrycode to concat the country name
+ String n = countrycodes.get(ccode);
+ if (n != null) {
+ concatIndexEntry = n + ", " + placeName;
+ } else {
+ ///don't want a single token hierarchy entry.
+ concatIndexEntry = "";
+ }
+ }
+ if (ccode == null) {
+ System.out.println("naughty country code");
+ }
+ for (int i = 0; i < fields.size() - 1; i++) {
+ doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+
+ }
+
+ /**
+ * add standard fields to the index
+ */
+ doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
+ doc.add(new TextField("placename", placeName, Field.Store.YES));
+ doc.add(new TextField("latitude", lat, Field.Store.YES));
+ doc.add(new TextField("longitude", lon, Field.Store.YES));
+ doc.add(new TextField("loctype", dsg, Field.Store.YES));
+ doc.add(new TextField("admincode", (ccode + "." + admincode).toLowerCase(), Field.Store.YES));
+ doc.add(new TextField("countrycode", ccode.toLowerCase(), Field.Store.YES));
+ doc.add(new TextField("countycode", "", Field.Store.YES));
+
+ doc.add(new TextField("locid", id, Field.Store.YES));
+ doc.add(new TextField("gazsource", "geonames", Field.Store.YES));
+ w.addDocument(doc);
+
+ counter++;
+ if (counter % 100000 == 0) {
+ w.commit();
+ System.out.println(counter + " .........Geonames entries committed to index..............");
+ }
+
+ }
+
+ System.out.println("Completed indexing gaz! index name is: " + type.toString());
+ }
+
+}
Added: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java?rev=1609600&view=auto
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java (added)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java Fri Jul 11 01:04:58 2014
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2014 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.addons.geoentitylinker.AdminBoundary;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+/**
+ *
+ * @author mgiaconia
+ */
+public class RegionProcessor {
+
+ public static void main(String[] args) {
+ RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null);
+ }
+
+ /**
+ *
+ * @param regionsFile the file that stores Region references. the format of
+ * this file is tab delimitted text with index 0 as the name of the region,
+ * index 1 as the longitude, and index 2 as the latitude
+ * @param outputCountryContextfile this is the country context files shared by
+ * all indexing processors
+ * @param w
+ */
+ public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
+ try {
+ readFile(regionsFile, outputCountryContextfile, w);
+ } catch (Exception ex) {
+ ex.printStackTrace();
+ }
+ }
+
+ public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception {
+ List<String> ccfileentries = new ArrayList<>();
+ BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+ List<String> fields = new ArrayList<>();
+ int counter = 0;
+ System.out.println("reading gazetteer data from Regions file...........");
+ String line = "";
+ while ((line = reader.readLine()) != null) {
+
+ String[] values = line.split("\t");
+ if (counter == 0) {
+
+ } else {
+ Document doc = new Document();
+ for (int i = 0; i < fields.size() - 1; i++) {
+ doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+ }
+ String placeName = values[0];
+ String lat = values[2];
+ String lon = values[1];
+ String dsg = "region";
+ String id = "rg" + counter;
+
+ String hierarchy = placeName;
+
+ doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
+ doc.add(new TextField("placename", placeName, Field.Store.YES));
+ doc.add(new TextField("latitude", lat, Field.Store.YES));
+ doc.add(new TextField("longitude", lon, Field.Store.YES));
+ doc.add(new TextField("loctype", dsg, Field.Store.YES));
+ doc.add(new TextField("admincode", "", Field.Store.YES));
+ doc.add(new TextField("countrycode", id, Field.Store.YES));
+ doc.add(new TextField("countycode", "", Field.Store.YES));
+
+ doc.add(new TextField("locid", id, Field.Store.YES));
+ doc.add(new TextField("gazsource", "region", Field.Store.YES));
+ //countrycontext file format
+ // US KY 131 United States Kentucky Leslie
+
+ ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND_VALUE" + "\t" + "NO_DATA_FOUND_VALUE\n");
+ if (w != null) {
+ w.addDocument(doc);
+ }
+ }
+ counter++;
+
+ }
+ if (w != null) {
+ w.commit();
+ }
+ FileWriter writer = new FileWriter(outputCountryContextfile, true);
+ for (String string : ccfileentries) {
+ writer.write(string);
+ }
+ System.out.println("successfully wrote Region entries to country oontext file");
+ writer.close();
+ System.out.println("Completed indexing regions!");
+ }
+
+}