You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/01/18 21:03:54 UTC
svn commit: r1559407 - in
/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker:
CountryContext.java CountryContextHit.java GazateerEntry.java
GazateerSearcher.java GeoEntityLinker.java GeoEntityLinkerSetupUtils.java
Author: markg
Date: Sat Jan 18 20:03:54 2014
New Revision: 1559407
URL: http://svn.apache.org/r1559407
Log:
OPENNLP-637
OPENNLP-639
Fixed and optimized GazateerSearcher to cache properly. Added hashCode and equals to GazateerEntry and ensured no duplicates are returned.
Removed:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextHit.java
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java Sat Jan 18 20:03:54 2014
@@ -16,6 +16,7 @@
package opennlp.addons.geoentitylinker;
import java.io.BufferedReader;
+import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
@@ -38,11 +39,19 @@ import opennlp.tools.entitylinker.Entity
public class CountryContext {
private List<CountryContextEntry> countrydata;
- private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
- private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
+ private Map<String, Set<String>> nameCodesMap = new HashMap<>();
+ private Map<String, Set<Integer>> countryMentions = new HashMap<>();
private Set<CountryContextEntry> countryHits = new HashSet<>();
+ private EntityLinkerProperties properties;
- public CountryContext() {
+ public CountryContext(EntityLinkerProperties properties) throws Exception {
+ this.properties = properties;
+ if (countrydata == null) {
+ String path = this.properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
+
+ File countryContextFile = new File(path);
+ countrydata = getCountryContextFromFile(countryContextFile);
+ }
}
public Map<String, Set<Integer>> getCountryMentions() {
@@ -57,10 +66,12 @@ public class CountryContext {
public Set<CountryContextEntry> getCountryHits() {
return countryHits;
}
-/**
- * returns the last name to codes map after calling regexFind
- * @return
- */
+
+ /**
+ * returns the last name to codes map after calling regexFind
+ *
+ * @return
+ */
public Map<String, Set<String>> getNameCodesMap() {
return nameCodesMap;
}
@@ -83,15 +94,12 @@ public class CountryContext {
* @param properties EntityLinkerProperties for getting database connection
* @return
*/
- public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
+ public Map<String, Set<Integer>> regexfind(String docText) {
countryMentions = new HashMap<>();
nameCodesMap.clear();
try {
- if (countrydata == null) {
- countrydata = getCountryContextFromFile(properties);
- // countrydata = getCountryData(properties);
- }
+
for (CountryContextEntry entry : countrydata) {
Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher rs = regex.matcher(docText);
@@ -133,9 +141,9 @@ public class CountryContext {
return countryMentions;
}
- private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
+ private List<CountryContextEntry> getCountryContextFromFile(File countryContextFile) {
List<CountryContextEntry> entries = new ArrayList<>();
- String path = "";// properties.getProperty("geoentitylinker.countrycontext.filepath", "");
+ String path = countryContextFile.getPath();
BufferedReader reader;
try {
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java Sat Jan 18 20:03:54 2014
@@ -17,6 +17,7 @@ package opennlp.addons.geoentitylinker;
import java.util.HashMap;
import java.util.Map;
+import java.util.Objects;
import opennlp.tools.entitylinker.domain.BaseLink;
/**
@@ -116,4 +117,40 @@ public class GazateerEntry extends BaseL
public void setIndexData(Map<String, String> indexData) {
this.indexData = indexData;
}
+
+ @Override
+ public int hashCode() {
+ int hash = 7;
+ hash = 29 * hash + Objects.hashCode(this.latitude);
+ hash = 29 * hash + Objects.hashCode(this.longitude);
+ hash = 29 * hash + Objects.hashCode(this.source);
+ hash = 29 * hash + Objects.hashCode(this.indexID);
+ return hash;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ final GazateerEntry other = (GazateerEntry) obj;
+ if (!Objects.equals(this.latitude, other.latitude)) {
+ return false;
+ }
+ if (!Objects.equals(this.longitude, other.longitude)) {
+ return false;
+ }
+ if (!Objects.equals(this.source, other.source)) {
+ return false;
+ }
+ if (!Objects.equals(this.indexID, other.indexID)) {
+ return false;
+ }
+ return true;
+ }
+
+
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java Sat Jan 18 20:03:54 2014
@@ -53,8 +53,11 @@ public class GazateerSearcher {
private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
private Analyzer usgsAnalyzer;
+ private EntityLinkerProperties properties;
- public GazateerSearcher() {
+ public GazateerSearcher(EntityLinkerProperties properties) throws Exception {
+ this.properties = properties;
+ init();
}
/**
@@ -66,39 +69,26 @@ public class GazateerSearcher {
* lucene indexes are
* @return
*/
- public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
+ public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ String luceneQueryString = "";
try {
/**
* build the search string Sometimes no country context is found. In this
* case the code variable will be an empty string
*/
- String luceneQueryString = !code.equals("")
+ luceneQueryString = !code.equals("")
? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
: "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
/**
* check the cache and go no further if the records already exist
*/
- ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
+ ArrayList<GazateerEntry> get = GazateerSearchCache.get(luceneQueryString);
if (get != null) {
-
+
return get;
}
- if (geonamesIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
- if(indexloc.equals("")){
- System.out.println("Geonames Gaz location not found");
- return linkedData;
- }
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
- scoreCutoff = Double.valueOf(cutoff);
- geonamesIndex = new MMapDirectory(new File(indexloc));
- geonamesReader = DirectoryReader.open(geonamesIndex);
- geonamesSearcher = new IndexSearcher(geonamesReader);
- //TODO: a language code switch statement should be employed here at some point
- geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
- }
QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
Query q = parser.parse(luceneQueryString);
@@ -152,19 +142,22 @@ public class GazateerSearcher {
}
//only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
- linkedData.add(entry);
+ if (!linkedData.contains(entry)) {
+ linkedData.add(entry);
+ }
}
}
-
- normalize(linkedData, 0d, maxScore);
- prune(linkedData);
+ if (!linkedData.isEmpty()) {
+ normalize(linkedData, 0d, maxScore);
+ prune(linkedData);
+ }
} catch (IOException | ParseException ex) {
System.err.println(ex);
}
/**
* add the records to the cache for this query
*/
- GazateerSearchCache.put(searchString, linkedData);
+ GazateerSearchCache.put(luceneQueryString, linkedData);
return linkedData;
}
@@ -177,43 +170,27 @@ public class GazateerSearcher {
* @param properties properties file that states where the lucene indexes
* @return
*/
- public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned, EntityLinkerProperties properties) {
+ public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned) {
ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
try {
- String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
+
/**
* hit the cache
*/
- ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
+ ArrayList<GazateerEntry> get = GazateerSearchCache.get(luceneQueryString);
if (get != null) {
//if the name is already there, return the list of cavhed results
return get;
}
- if (usgsIndex == null) {
- String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
- if(indexloc.equals("")){
- System.out.println("USGS Gaz location not found");
- return linkedData;
- }
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
- scoreCutoff = Double.valueOf(cutoff);
- usgsIndex = new MMapDirectory(new File(indexloc));
- usgsReader = DirectoryReader.open(usgsIndex);
- usgsSearcher = new IndexSearcher(usgsReader);
- usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
- }
-
-
QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, usgsAnalyzer);
Query q = parser.parse(luceneQueryString);
-
TopDocs search = usgsSearcher.search(q, rowsReturned);
double maxScore = (double) search.getMaxScore();
-
- for (int i = 0; i < search.scoreDocs.length; ++i) {
+ for (int i = 0; i < search.scoreDocs.length; i++) {
GazateerEntry entry = new GazateerEntry();
int docId = search.scoreDocs[i].doc;
double sc = search.scoreDocs[i].score;
@@ -224,8 +201,6 @@ public class GazateerSearcher {
entry.setIndexID(docId + "");
entry.setSource("usgs");
entry.setItemParentID("us");
-
-
Document d = usgsSearcher.doc(docId);
List<IndexableField> fields = d.getFields();
for (int idx = 0; idx < fields.size(); idx++) {
@@ -250,20 +225,21 @@ public class GazateerSearcher {
}
entry.getIndexData().put(fields.get(idx).name(), value);
}
- linkedData.add(entry);
-
-
+ if (!linkedData.contains(entry)) {
+ linkedData.add(entry);
+ }
+ }
+ if (!linkedData.isEmpty()) {
+ normalize(linkedData, 0d, maxScore);
+ prune(linkedData);
}
-
- normalize(linkedData, 0d, maxScore);
- prune(linkedData);
} catch (IOException | ParseException ex) {
System.err.println(ex);
}
/**
* add the records to the cache for this query
*/
- GazateerSearchCache.put(searchString, linkedData);
+ GazateerSearchCache.put(luceneQueryString, linkedData);
return linkedData;
}
@@ -308,4 +284,35 @@ public class GazateerSearcher {
d = d == null ? 0d : d;
return d;
}
+
+ private void init() throws Exception {
+ if (usgsIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+ if (indexloc.equals("")) {
+ System.out.println("USGS Gaz location not found");
+
+ }
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+ scoreCutoff = Double.valueOf(cutoff);
+ usgsIndex = new MMapDirectory(new File(indexloc));
+ usgsReader = DirectoryReader.open(usgsIndex);
+ usgsSearcher = new IndexSearcher(usgsReader);
+ usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ }
+ if (geonamesIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
+ if (indexloc.equals("")) {
+ System.out.println("Geonames Gaz location not found");
+
+ }
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
+ scoreCutoff = Double.valueOf(cutoff);
+ geonamesIndex = new MMapDirectory(new File(indexloc));
+ geonamesReader = DirectoryReader.open(geonamesIndex);
+ geonamesSearcher = new IndexSearcher(geonamesReader);
+ //TODO: a language code switch statement should be employed here at some point
+ geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+
+ }
+ }
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Sat Jan 18 20:03:54 2014
@@ -30,15 +30,13 @@ import opennlp.tools.entitylinker.Entity
* scoring techniques to enable resolution. The gazateers are stored in lucene
* indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class
* in this same package.
- *
- *
*/
-public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
+public class GeoEntityLinker implements EntityLinker<LinkedSpan, EntityLinkerProperties> {
private CountryContext countryContext;
private Map<String, Set<Integer>> countryMentions;
private EntityLinkerProperties linkerProperties;
- private GazateerSearcher gazateerSearcher = new GazateerSearcher();
+ private GazateerSearcher gazateerSearcher;
private List<LinkedEntityScorer> scorers = new ArrayList<>();
/**
* Flag for deciding whether to search gaz only for toponyms within countries
@@ -46,8 +44,7 @@ public class GeoEntityLinker implements
*/
private Boolean filterCountryContext = true;
- public GeoEntityLinker() {
- countryContext = new CountryContext();
+ public GeoEntityLinker() throws Exception {
}
@Override
@@ -57,7 +54,7 @@ public class GeoEntityLinker implements
if (linkerProperties == null) {
throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
}
- countryMentions = countryContext.regexfind(doctext, linkerProperties);
+ countryMentions = countryContext.regexfind(doctext);
for (int s = 0; s < sentences.length; s++) {
Span[] names = namesBySentence[s];
@@ -66,28 +63,33 @@ public class GeoEntityLinker implements
for (int i = 0; i < matches.length; i++) {
-//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
+ /**
+ * nga gazateer is for other than US placenames,don't want to use it if
+ * US is the only country mentioned in the doc
+ *
+ */
ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
- if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
- // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
+ if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)
+ || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
+
if (!countryMentions.keySet().isEmpty()) {
for (String code : countryMentions.keySet()) {
if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code));
}
}
} else {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, "", linkerProperties));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, ""));
}
}
- ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
+ ArrayList<BaseLink> usgsEntries = new ArrayList<>();
if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
//usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
- usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3, linkerProperties));
+ usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3));
}
- LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
+ LinkedSpan<BaseLink> geoSpan = new LinkedSpan<>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
if (!usgsEntries.isEmpty()) {
geoSpan.getLinkedEntries().addAll(usgsEntries);
@@ -102,21 +104,34 @@ public class GeoEntityLinker implements
}
}
+ if (!scorers.isEmpty()) {
+ for (LinkedEntityScorer scorer : scorers) {
+ scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
+ }
+ }
+
+ return spans;
+ }
+
+ private void loadScorers() {
if (scorers.isEmpty()) {
scorers.add(new FuzzyStringMatchScorer());
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
scorers.add(new ModelBasedScorer());
}
- for (LinkedEntityScorer scorer : scorers) {
- scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
- }
- return spans;
}
@Override
- public void setEntityLinkerProperties(EntityLinkerProperties properties) {
- this.linkerProperties = properties;
+ public void init(EntityLinkerProperties properties) {
+ try {
+ this.linkerProperties = properties;
+ countryContext = new CountryContext(this.linkerProperties);
+ gazateerSearcher = new GazateerSearcher(this.linkerProperties);
+ loadScorers();
+ } catch (Exception ex) {
+ throw new RuntimeException(ex);
+ }
}
@Override
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java?rev=1559407&r1=1559406&r2=1559407&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java Sat Jan 18 20:03:54 2014
@@ -85,13 +85,13 @@ public class GeoEntityLinkerSetupUtils {
* opennlp.geoentitylinker.countrycontext.filepath
* @throws IOException
*/
- public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
- CountryContext context = new CountryContext();
+ public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws Exception {
+ CountryContext context = new CountryContext(properties);
FileWriter writer = new FileWriter(annotationOutFile, true);
System.out.println("processing " + documents.size() + " documents");
for (String docText : documents) {
System.out.append(".");
- Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
+ Map<String, Set<Integer>> regexfind = context.regexfind(docText);
Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
for (String key : modelCountryContext.keySet()) {
for (String wordbag : modelCountryContext.get(key)) {