You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/01/12 15:44:55 UTC
svn commit: r1557540 -
/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/
Author: markg
Date: Sun Jan 12 14:44:54 2014
New Revision: 1557540
URL: http://svn.apache.org/r1557540
Log:
OPENNLP-579
Many efficiencies. Fails gracefully if any resources are missing (Gazateers, countrycontext data, etc)
Updated javadocs and comments
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContext.java Sun Jan 12 14:44:54 2014
@@ -37,7 +37,6 @@ import opennlp.tools.entitylinker.Entity
*/
public class CountryContext {
-
private List<CountryContextEntry> countrydata;
private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
@@ -50,10 +49,18 @@ public class CountryContext {
return countryMentions;
}
+ /**
+ * returns the last set of hits after calling regexFind
+ *
+ * @return
+ */
public Set<CountryContextEntry> getCountryHits() {
return countryHits;
}
-
+/**
+ * returns the last name to codes map after calling regexFind
+ * @return
+ */
public Map<String, Set<String>> getNameCodesMap() {
return nameCodesMap;
}
@@ -63,10 +70,14 @@ public class CountryContext {
}
/**
- * Finds mentions of countries based on a list from MySQL stored procedure
- * called getCountryList. This method finds country mentions in documents,
- * which is an essential element of the scoring that is done for geo
- * linkedspans. Lazily loads the list from the database.
+ * Finds mentions of countries to assist in toponym resolution. Countries are
+ * discovered via regex based on a configured file called
+ * opennlp.geoentitylinker.countrycontext.txt. the file is configured using
+ * the entitylinker.properties file as such:
+ * opennlp.geoentitylinker.countrycontext.filepath=/opt/opennlp/opennlp.geoentitylinker.countrycontext.txt
+ *
+ * Finding mentions in documents is very helpful for scoring. Lazily loads the
+ * list from the file.
*
* @param docText the full text of the document
* @param properties EntityLinkerProperties for getting database connection
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryContextEntry.java Sun Jan 12 14:44:54 2014
@@ -18,7 +18,7 @@ package opennlp.addons.geoentitylinker;
import java.util.Objects;
/**
- *Stores a tuple from mysql that is used to find country mentions in document text.
+ *Stores a tuple from the opennlp.geoentitylinker.countrycontext.txt file, which is used to find country mentions in document text.
*
*/
public class CountryContextEntry {
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/CountryProximityScorer.java Sun Jan 12 14:44:54 2014
@@ -28,7 +28,13 @@ import opennlp.tools.entitylinker.domain
import opennlp.tools.util.Span;
/**
- * Scores toponyms based on country context as well as fuzzy string matching
+ * Scores toponyms based on their proximity to a country mention. Based on the
+ * heuristic that toponym mentions are more likely close to their parent
+ * country mentions. For instance, if the toponym Berlin is mentioned near an
+ * indicator of Germany, it is more likely to be Berlin Germany than Berlin
+ * Connecticut.
+ *
+ *
*/
public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
@@ -45,8 +51,7 @@ public class CountryProximityScorer impl
/**
* Assigns a score to each BaseLink in each linkedSpan's set of N best
* matches. Currently the scoring indicates the probability that the toponym
- * is correct based on the country context in the document and fuzzy string
- * matching
+ * is correct based on the country context in the document
*
* @param linkedData the linked spans, holds the Namefinder results, and
* the list of BaseLink for each
@@ -101,9 +106,11 @@ public class CountryProximityScorer impl
*/
private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
Double score = 0.0;
- //get the index of the actual span, begining of sentence
- //should generate tokens from sentence and create a char offset...
- //could have large sentences due to poor sentence detection or wonky doc text
+ /*
+ * get the index of the actual span, beginning of sentence. Should generate
+ * tokens from sentence and create a char offset... Could have large
+ * sentences due to poor sentence detection or wonky doc text
+ */
int sentenceIdx = span.getSentenceid();
int sentIndexInDoc = sentences[sentenceIdx].getStart();
/**
@@ -151,7 +158,7 @@ public class CountryProximityScorer impl
//if so, is it the correct country code for that name?
if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
//boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
- //TODO: make this multiplier configurable
+ //TODO: make this smarter, and utilize province/state info in the future to be even more specific
score = (score + .75) > 1.0 ? 1d : (score + .75);
if (link.getItemParentID().equals(dominantCode)) {
@@ -166,10 +173,10 @@ public class CountryProximityScorer impl
}
/**
- * takes a map of distances from the NE to each country mention and generates
- * a map of scores for each country code. The map is then correlated to teh
- * correlated to the code of the BaseLink parentid for retrieval. Then the
- * score is added to the overall.
+ * takes a map of distances from the toponym to each country mention and generates
+ * a map of scores for each country code. The map is then correlated to the
+ * code of the BaseLink parentid for retrieval. Then the
+ * score is added to the overall list.
*
* @param distanceMap
* @param sentences
@@ -216,7 +223,7 @@ public class CountryProximityScorer impl
* together to smooth out the average, so one distant outlier does not kill
* the score for an obviously good hit. More elegant solution is possible
* using Math.pow, and making the score decay with distance by using an
- * increasing negative exponent
+ * increasing negative exponent (I think)
*
* @param normDis the normalized and sorted set of distances as a list
* @return
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/FuzzyStringMatchScorer.java Sun Jan 12 14:44:54 2014
@@ -26,7 +26,7 @@ import opennlp.tools.util.Span;
/**
*
- * Generates scores for string comparisons.
+ * Generates scores based on string comparisons (Levenshtein and Dice).
*/
public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerEntry.java Sun Jan 12 14:44:54 2014
@@ -21,7 +21,7 @@ import opennlp.tools.entitylinker.domain
/**
*
- * Stores a record from a geographic placenames gazateer
+ * Stores a minimal amount of information from a geographic placenames gazateer
*/
public class GazateerEntry extends BaseLink {
@@ -29,46 +29,91 @@ public class GazateerEntry extends BaseL
private Double longitude;
private String source;
private String indexID;
- private Map<String, String> indexData=new HashMap<>();
+ private Map<String, String> indexData = new HashMap<>();
+ /**
+ * returns the id from the lucene document
+ *
+ * @return
+ */
public String getIndexID() {
return indexID;
}
+ /*
+ * sets the id from the lucene document
+ */
public void setIndexID(String indexID) {
this.indexID = indexID;
}
+ /**
+ * returns the latitude from the gazateer
+ *
+ * @return
+ */
public Double getLatitude() {
return latitude;
}
+ /**
+ * sets the latitude from the gazateer
+ *
+ */
public void setLatitude(Double latitude) {
this.latitude = latitude;
}
+ /**
+ * returns the longitude from the gaz
+ *
+ * @return
+ */
public Double getLongitude() {
return longitude;
}
+ /**
+ * sets the longitude from the gaz
+ *
+ * @param longitude
+ */
public void setLongitude(Double longitude) {
this.longitude = longitude;
}
+ /**
+ * returns the source of the gazateer data
+ *
+ * @return
+ */
public String getSource() {
return source;
}
+ /**
+ * sets the source (the source of the gazateer data)
+ *
+ * @param source
+ */
public void setSource(String source) {
this.source = source;
}
+ /**
+ * Returns all the other fields in the gazateer in the form of a map
+ *
+ * @return
+ */
public Map<String, String> getIndexData() {
return indexData;
}
+ /**
+ * sets the other fields in the gazateer in the form of a map
+ *
+ */
public void setIndexData(Map<String, String> indexData) {
this.indexData = indexData;
}
-
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerIndexer.java Sun Jan 12 14:44:54 2014
@@ -79,7 +79,13 @@ public class GazateerIndexer {
}
}
}
-
+/**
+ * indexes the USGS or Geonames gazateers.
+ * @param outputIndexDir a DIRECTORY path where you would like to store the output lucene indexes
+ * @param gazateerInputData the file, "as is" that was downloaded from the USGS and GEONAMES website
+ * @param type indicates whether the data is USGS or GEONAMES format
+ * @throws Exception
+ */
public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
if (!outputIndexDir.isDirectory()) {
throw new IllegalArgumentException("outputIndexDir must be a directory.");
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearchCache.java Sun Jan 12 14:44:54 2014
@@ -21,13 +21,17 @@ import java.util.Map;
/**
*
- * Caches gazateer query results statically
+ * Caches gazateer query results statically. Clears itself if more than 10000 results are cached.
*/
public class GazateerSearchCache {
private static Map<String, ArrayList<GazateerEntry>> gazCache = new HashMap<>();
-
+/**
+ * returns the cached entries. Returns null if the query does not exist in the cache
+ * @param searchString
+ * @return
+ */
public static synchronized ArrayList<GazateerEntry> get(String searchString) {
return gazCache.get(searchString);
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazateerSearcher.java Sun Jan 12 14:44:54 2014
@@ -39,7 +39,7 @@ import opennlp.tools.entitylinker.Entity
/**
*
- * Searches Gazateers stored in a MMapDirectory lucene index
+ * Searches Gazateers stored in a MMapDirectory Lucene index
*/
public class GazateerSearcher {
@@ -59,18 +59,19 @@ public class GazateerSearcher {
/**
*
- * @param searchString the nameed entity to look up in the lucene index
+ * @param searchString the named entity to look up in the lucene index
* @param rowsReturned how many rows to allow lucene to return
* @param code the country code
- * @param properties properties file that states where the lucene indexes
- * are
+ * @param properties the entitylinker.properties file that states where the
+ * lucene indexes are
* @return
*/
public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
ArrayList<GazateerEntry> linkedData = new ArrayList<>();
try {
/**
- * build the search string
+ * build the search string. Sometimes no country context is found. In this
+ * case the code variable will be an empty string
*/
String luceneQueryString = !code.equals("")
? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase() + "^1000"
@@ -80,25 +81,28 @@ public class GazateerSearcher {
*/
ArrayList<GazateerEntry> get = GazateerSearchCache.get(searchString);
if (get != null) {
+
return get;
}
if (geonamesIndex == null) {
String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
- String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".60");
+ if(indexloc.equals("")){
+ System.out.println("Geonames Gaz location not found");
+ return linkedData;
+ }
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", String.valueOf(scoreCutoff));
scoreCutoff = Double.valueOf(cutoff);
geonamesIndex = new MMapDirectory(new File(indexloc));
geonamesReader = DirectoryReader.open(geonamesIndex);
geonamesSearcher = new IndexSearcher(geonamesReader);
+ //TODO: a language code switch statement should be employed here at some point
geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
}
-
-
QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
Query q = parser.parse(luceneQueryString);
-
TopDocs search = geonamesSearcher.search(q, rowsReturned);
double maxScore = (double) search.getMaxScore();
@@ -118,6 +122,12 @@ public class GazateerSearcher {
for (int idx = 0; idx < fields.size(); idx++) {
String value = d.get(fields.get(idx).name());
value = value.toLowerCase();
+ /**
+ * these positions map to the required fields in the gaz. TODO: allow a
+ * configurable list of columns that map to the GazateerEntry fields,
+ * then users would be able to plug in any gazateer they have (if they
+ * build a lucene index out of it)
+ */
switch (idx) {
case 1:
entry.setItemID(value);
@@ -140,7 +150,7 @@ public class GazateerSearcher {
}
entry.getIndexData().put(fields.get(idx).name(), value);
}
- //only keep it if the country code is a match
+ //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
linkedData.add(entry);
}
@@ -182,6 +192,10 @@ public class GazateerSearcher {
}
if (usgsIndex == null) {
String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+ if(indexloc.equals("")){
+ System.out.println("USGS Gaz location not found");
+ return linkedData;
+ }
String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
scoreCutoff = Double.valueOf(cutoff);
usgsIndex = new MMapDirectory(new File(indexloc));
@@ -264,15 +278,31 @@ public class GazateerSearcher {
}
}
+ /**
+ * gets rid of entries that are below the score thresh
+ *
+ * @param linkedData
+ */
private void prune(ArrayList<GazateerEntry> linkedData) {
for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {
GazateerEntry ge = itr.next();
+ /**
+ * throw away anything under the configured score thresh
+ */
if (ge.getScoreMap().get("lucene") < scoreCutoff) {
itr.remove();
}
}
}
+ /**
+ * normalizes the different Levenshtein scores returned from the query into a
+ * value in the range 0 to 1 (min-max scaling)
+ *
+ * @param valueToNormalize the raw score
+ * @param minimum the min of the range of scores
+ * @param maximum the max of the range
+ * @return the normed score
+ */
private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
d = d == null ? 0d : d;
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Sun Jan 12 14:44:54 2014
@@ -26,8 +26,10 @@ import opennlp.tools.entitylinker.Entity
import opennlp.tools.entitylinker.EntityLinker;
/**
- * Links location entities to gazatteers. Currently supports gazateers in a
- * MySql database (NGA and USGS)
+ * Links location entities to the USGS and GeoNames gazatteers, and uses several
+ * scoring techniques to enable resolution. The gazateers are stored in lucene
+ * indexes. The indexes can be built using the GeoEntityLinkerSetupUtils class
+ * in this same package.
*
*
*/
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinkerSetupUtils.java Sun Jan 12 14:44:54 2014
@@ -37,29 +37,45 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import static opennlp.addons.geoentitylinker.ModelBasedScorer.RADIUS;
-
/**
*
* Tools for setting up GeoEntityLinker gazateers and doccat scoring model
*/
public class GeoEntityLinkerSetupUtils {
+
public static ModelBasedScorer scorer;
static {
scorer = new ModelBasedScorer();
}
- public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type){
- GazateerIndexer indexer = new GazateerIndexer();
- try {
- indexer.index(outputIndexDir, gazateerInputData, type);
- } catch (Exception ex) {
- ex.printStackTrace();
- }
+
+ /**
+ * Generates the lucene indexes of the USGS and GEONAMES gazateers.
+ *
+ * @param outputIndexDir the destination directory of the index. Must be a
+ * directory
+ * @param gazateerInputData the input data file. Must be in geonames gaz
+ * format, or USGS format
+ * @param type the type, USGS, or GEONAMES
+ */
+ public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type) {
+ GazateerIndexer indexer = new GazateerIndexer();
+ try {
+ indexer.index(outputIndexDir, gazateerInputData, type);
+ } catch (Exception ex) {
+ ex.printStackTrace();
}
- /**
+ }
+
+ /**
+ * Generates a doccat model from proximal features generated from surrounding
+ * context of country mentions. This model is used as a basis for a score
+ * called coutrymodel, which takes the context from around a toponym, and uses
+ * this model to return a score for the country code of the toponym hit in the
+ * gazateer.
*
* @param documents A list of document texts, for best results try to
- * ensure each country you care about will be
+ * ensure each country you care about will be well
* represented in the collection
* @param annotationOutFile the location where the annotated doccat text file
* will be stored
@@ -83,7 +99,7 @@ public class GeoEntityLinkerSetupUtils {
}
}
}
- System.out.println("Document processing complete. Writing training data to "+ annotationOutFile.getAbsolutePath());
+ System.out.println("Document processing complete. Writing training data to " + annotationOutFile.getAbsolutePath());
writer.close();
System.out.println("Building Doccat model...");
DoccatModel model = null;
@@ -98,7 +114,7 @@ public class GeoEntityLinkerSetupUtils {
model = DocumentCategorizerME.train("en", sampleStream);
OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
model.serialize(modelOut);
- System.out.println("Model complete!");
+ System.out.println("Model complete!");
} catch (IOException e) {
// Failed to read or parse training data, training failed
e.printStackTrace();
@@ -142,5 +158,4 @@ public class GeoEntityLinkerSetupUtils {
}
return featureBags;
}
-
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoHashBinningScorer.java Sun Jan 12 14:44:54 2014
@@ -28,18 +28,19 @@ import opennlp.tools.entitylinker.domain
import opennlp.tools.util.Span;
/**
- *Scores toponymns based on geographic point binning (clustering). This classes output is highly dependant on the quality
- * of points returned from the gazateer. False positive hits from the index will pollute this result. Ensure the score cutoff for the
- * Lucene search is set to an appropriate level so this class if not fed poor data.
+ * Scores toponymns based on geographic point binning. Based on the heuristic
+ * that docs are generally about a small amount of locations, so one can detect
+ * outliers by finding those points that are not near the majority
+ *
*/
public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
@Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
- score( linkedSpans);
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+ score(linkedSpans);
}
- private void score(List<LinkedSpan> geospans) {
+ private void score(List<LinkedSpan> geospans) {
Map<Double, Double> latLongs = new HashMap<Double, Double>();
/**
@@ -50,7 +51,7 @@ public class GeoHashBinningScorer implem
if (bl instanceof GazateerEntry) {
GazateerEntry entry = (GazateerEntry) bl;
latLongs.put(entry.getLatitude(), entry.getLongitude());
-
+
}
}
}
@@ -77,7 +78,7 @@ public class GeoHashBinningScorer implem
if (bl instanceof GazateerEntry) {
GazateerEntry entry = (GazateerEntry) bl;
geohash = geoHash(entry.getLatitude(), entry.getLongitude());
-
+
}
if (scores.containsKey(geohash)) {
score = scores.get(geohash);
@@ -158,9 +159,9 @@ public class GeoHashBinningScorer implem
for (Long l : diffs) {
sum += l;
}
- Long avg=sum;
- if(!diffs.isEmpty()){
- avg = sum / diffs.size();
+ Long avg = sum;
+ if (!diffs.isEmpty()) {
+ avg = sum / diffs.size();
}
@@ -273,4 +274,3 @@ public class GeoHashBinningScorer implem
return d;
}
}
-
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/LinkedEntityScorer.java Sun Jan 12 14:44:54 2014
@@ -28,6 +28,7 @@ public interface LinkedEntityScorer<T> {
/**
* Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan
+ * this method internally affects the reference to linkedSpans that was passed in
* @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
* @param docText the full text of the document.
* @param sentenceSpans the sentence spans the correspond to the document text
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java?rev=1557540&r1=1557539&r2=1557540&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/ModelBasedScorer.java Sun Jan 12 14:44:54 2014
@@ -37,6 +37,7 @@ public class ModelBasedScorer implements
DocumentCategorizerME documentCategorizerME;
DoccatModel doccatModel;
public static final int RADIUS = 100;
+ boolean modelexists = false;
@Override
public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
@@ -44,8 +45,10 @@ public class ModelBasedScorer implements
if (doccatModel == null) {
String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
if (path.equals("")) {
- System.err.println(this.getClass().getSimpleName() + ": could not find property \"opennlp.geoentitylinker.modelbasedscorer.modelpath\" : no ModelBasedScoring will be performed");
-
+ if (!modelexists) {
+ System.err.println(this.getClass().getSimpleName() + ": could not find property \"opennlp.geoentitylinker.modelbasedscorer.modelpath\" : no ModelBasedScoring will be performed");
+ }
+ modelexists = true;
return;
}
doccatModel = new DoccatModel(new File(path));