You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2013/12/04 13:12:43 UTC
svn commit: r1547783 -
/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/
Author: markg
Date: Wed Dec 4 12:12:43 2013
New Revision: 1547783
URL: http://svn.apache.org/r1547783
Log:
OPENNLP-614
Fixed a bug in the GeoEntityLinker. No gaz lookup was being performed if no country context was found.
Modified:
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java Wed Dec 4 12:12:43 2013
@@ -18,11 +18,6 @@ package org.apache.opennlp.addons.tools.
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
-import java.sql.CallableStatement;
-import java.sql.Connection;
-import java.sql.DriverManager;
-import java.sql.ResultSet;
-import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@@ -42,12 +37,23 @@ import opennlp.tools.entitylinker.Entity
*/
public class CountryContext {
- private Connection con;
+
private List<CountryContextEntry> countrydata;
private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
private Set<CountryContextEntry> countryHits = new HashSet<>();
+ public CountryContext() {
+ }
+
+ public Map<String, Set<Integer>> getCountryMentions() {
+ return countryMentions;
+ }
+
+ public Set<CountryContextEntry> getCountryHits() {
+ return countryHits;
+ }
+
public Map<String, Set<String>> getNameCodesMap() {
return nameCodesMap;
}
@@ -56,10 +62,6 @@ public class CountryContext {
this.nameCodesMap = nameCodesMap;
}
- public CountryContext() {
- }
-
-
/**
* Finds mentions of countries based on a list from MySQL stored procedure
* called getCountryList. This method finds country mentions in documents,
@@ -71,15 +73,13 @@ public class CountryContext {
* @return
*/
public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
- countryMentions = new HashMap<String, Set<Integer>>();
+ countryMentions = new HashMap<>();
nameCodesMap.clear();
try {
-// if (con == null) {
-// con = getMySqlConnection(properties);
-// }
+
if (countrydata == null) {
- countrydata = getCountryContextFromFile(properties);
- // countrydata = getCountryData(properties);
+ countrydata = getCountryContextFromFile(properties);
+ // countrydata = getCountryData(properties);
}
for (CountryContextEntry entry : countrydata) {
Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
@@ -122,95 +122,6 @@ public class CountryContext {
return countryMentions;
}
- /**
- * returns a unique list of country codes
- *
- * @param countryMentions the countryMentions discovered
- * @return
- */
- public static Set<String> getCountryCodes(List<CountryContextHit> hits) {
- Set<String> ccs = new HashSet<String>();
- for (CountryContextHit hit : hits) {
- ccs.add(hit.getCountryCode().toLowerCase());
- }
- return ccs;
- }
-
- public static String getCountryCodeCSV(Set<String> hits) {
- String csv = "";
- if (hits.isEmpty()) {
- return csv;
- }
-
- for (String code : hits) {
- csv += "," + code;
- }
- return csv.substring(1);
- }
-
- private Connection getMySqlConnection(EntityLinkerProperties properties) throws Exception {
-
- String driver = properties.getProperty("db.driver", "org.gjt.mm.mysql.Driver");
- String url = properties.getProperty("db.url", "jdbc:mysql://localhost:3306/world");
- String username = properties.getProperty("db.username", "root");
- String password = properties.getProperty("db.password", "?");
-
- Class.forName(driver);
- Connection conn = DriverManager.getConnection(url, username, password);
- return conn;
- }
-
- /**
- * reads the list from the database by calling a stored procedure
- * getCountryList
- *
- * @param properties
- * @return
- * @throws SQLException
- */
- private List<CountryContextEntry> getCountryData(EntityLinkerProperties properties) throws SQLException {
- List<CountryContextEntry> entries = new ArrayList<CountryContextEntry>();
- try {
- if (con == null) {
- con = getMySqlConnection(properties);
- }
- CallableStatement cs;
- cs = con.prepareCall("CALL `getCountryList`()");
- ResultSet rs;
- rs = cs.executeQuery();
- if (rs == null) {
- return entries;
- }
- while (rs.next()) {
- CountryContextEntry s = new CountryContextEntry();
- //rc,cc1, full_name_nd_ro,dsg
- s.setRc(rs.getString(1));
- s.setCc1(rs.getString(2));
-//a.district,
- s.setFull_name_nd_ro(rs.getString(3));
-//b.name as countryname,
- s.setDsg(rs.getString(4));
- entries.add(s);
- }
-
- } catch (SQLException ex) {
- System.err.println(ex);
- } catch (Exception e) {
- System.err.println(e);
- } finally {
- con.close();
- }
- return entries;
- }
-
- public Map<String, Set<Integer>> getCountryMentions() {
- return countryMentions;
- }
-
- public Set<CountryContextEntry> getCountryHits() {
- return countryHits;
- }
-
private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
List<CountryContextEntry> entries = new ArrayList<>();
String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");
Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java Wed Dec 4 12:12:43 2013
@@ -30,7 +30,7 @@ public class CountryContextEntry {
private String cc1;
private String full_name_nd_ro;
private String dsg;
-
+ private String provCode;
public CountryContextEntry() {
}
@@ -41,6 +41,14 @@ public class CountryContextEntry {
this.dsg = dsg;
}
+ public String getProvCode() {
+ return provCode;
+ }
+
+ public void setProvCode(String provCode) {
+ this.provCode = provCode;
+ }
+
public String getRc() {
return rc;
}
Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java Wed Dec 4 12:12:43 2013
@@ -36,7 +36,7 @@ public class CountryProximityScorer impl
String dominantCode = "";
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
@@ -134,10 +134,10 @@ public class CountryProximityScorer impl
/**
* the gaz matches that have a country code that have mentions in the doc
- * that are closest to the Named Entity should return the best score Analyze
- * map generates a likelihood score that the toponym from the gaz is
- * referring to one of the countries Map<countrycode, prob that this span is
- * referring to the toponym form this code key>
+ * that are closest to the Named Entity should return the best score.
+ * analyzeMap generates a likelihood score that the toponym from the gaz is
+ * referring to one of the countries, i.e., Map<countrycode, prob that this
+ * span is referring to the toponym from this code key>
*/
Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
for (BaseLink link : span.getLinkedEntries()) {
@@ -148,21 +148,16 @@ public class CountryProximityScorer impl
score = scoreMap.get(spanCountryCode);
///does the name extracted match a country name?
if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
- //if so, is it the correct country code for that name
+ //if so, is it the correct country code for that name?
if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
//boost the score because it is likely that this is the location in the text, so add 50% to the score or set to 1
//TODO: make this multiplier configurable
- //TODO: improve this with a geographic/geometry based clustering (linear binning to be more precise) of points returned from the gaz
score = (score + .75) > 1.0 ? 1d : (score + .75);
- //boost the score if the hit is from the dominant country context
if (link.getItemParentID().equals(dominantCode)) {
score = (score + .25) > 1.0 ? 1d : (score + .25);
}
-
-
}
-
}
}
link.getScoreMap().put("countrycontext", score);
@@ -184,7 +179,7 @@ public class CountryProximityScorer impl
private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
Map<String, Double> scoreMap = new HashMap<String, Double>();
- if(distanceMap.isEmpty()){
+ if (distanceMap.isEmpty()) {
return scoreMap;
}
TreeSet<Integer> all = new TreeSet<Integer>();
@@ -195,8 +190,8 @@ public class CountryProximityScorer impl
Integer min = all.first();
Integer max = all.last();
- if(min==max){
- min=0;
+ if (min == max) {
+ min = 0;
}
for (String key : distanceMap.keySet()) {
Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java Wed Dec 4 12:12:43 2013
@@ -72,7 +72,9 @@ public class GazateerSearcher {
/**
* build the search string
*/
- String luceneQueryString = "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^10000";
+ String luceneQueryString = !code.equals("")
+ ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
+ : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
/**
* check the cache and go no further if the records already exist
*/
@@ -82,7 +84,7 @@ public class GazateerSearcher {
}
if (geonamesIndex == null) {
String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".60");
scoreCutoff = Double.valueOf(cutoff);
geonamesIndex = new MMapDirectory(new File(indexloc));
geonamesReader = DirectoryReader.open(geonamesIndex);
Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java?rev=1547783&r1=1547782&r2=1547783&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java Wed Dec 4 12:12:43 2013
@@ -68,10 +68,15 @@ public class GeoEntityLinker implements
ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
// geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
- for (String code : countryMentions.keySet()) {
- if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
+ if (!countryMentions.keySet().isEmpty()) {
+ for (String code : countryMentions.keySet()) {
+ if (!code.equals("us")) {
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
+ }
}
+ } else {
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, "", linkerProperties));
+
}
}