You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/03/07 13:15:36 UTC
svn commit: r1575244 - in
/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker:
GazetteerIndexer.java GazetteerSearcher.java GeoEntityLinker.java
Author: markg
Date: Fri Mar 7 12:15:35 2014
New Revision: 1575244
URL: http://svn.apache.org/r1575244
Log:
OPENNLP-664
Fixed: country codes are no longer ignored.
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java?rev=1575244&r1=1575243&r2=1575244&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java Fri Mar 7 12:15:35 2014
@@ -23,9 +23,9 @@ import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -94,7 +94,7 @@ public class GazetteerIndexer {
String indexloc = outputIndexDir + type.toString();
Directory index = new MMapDirectory(new File(indexloc));
- Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
+ Analyzer a = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
IndexWriter w = new IndexWriter(index, config);
@@ -107,9 +107,8 @@ public class GazetteerIndexer {
public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<String>();
+ List<String> fields = new ArrayList<>();
int counter = 0;
- // int langCodeIndex = 0;
System.out.println("reading gazetteer data from file...........");
while (reader.read() != -1) {
String line = reader.readLine();
@@ -137,14 +136,4 @@ public class GazetteerIndexer {
System.out.println("Completed indexing gaz! index name is: " + type.toString());
}
- /**
- * TODO: make these analyzers configurable
- */
-// private void loadAnalyzerMap() {
-//// languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
-//// languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
-//// languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
-//// languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
-//
-// }
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1575244&r1=1575243&r2=1575244&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Fri Mar 7 12:15:35 2014
@@ -35,6 +35,7 @@ import org.apache.lucene.store.Directory
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import opennlp.tools.entitylinker.EntityLinkerProperties;
+import org.apache.lucene.analysis.util.CharArraySet;
/**
*
@@ -72,18 +73,14 @@ public class GazetteerSearcher {
*/
public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
- if (code.toLowerCase().equals("in") && searchString.toLowerCase().equals("india")) {
- rowsReturned=100;
- System.out.println("india");
- }
- String luceneQueryString = "";
+
try {
/**
* Build the search string. Sometimes no country context is found. In this
* case the code variable will be an empty string.
*/
- luceneQueryString = !code.equals("")
- ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
+ String luceneQueryString = !code.equals("")
+ ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()//+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
: "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
/**
* check the cache and go no further if the records already exist
@@ -97,7 +94,6 @@ public class GazetteerSearcher {
QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
Query q = parser.parse(luceneQueryString);
-
TopDocs search = geonamesSearcher.search(q, rowsReturned);
for (int i = 0; i < search.scoreDocs.length; ++i) {
@@ -135,7 +131,7 @@ public class GazetteerSearcher {
break;
case 12:
entry.setItemParentID(value);
- if(!value.toLowerCase().equals(code.toLowerCase())){
+ if (!value.toLowerCase().equals(code.toLowerCase())) {
continue;
}
break;
@@ -153,8 +149,6 @@ public class GazetteerSearcher {
* only want hits above the Levenshtein threshold
*/
if (normLev.compareTo(scoreCutoff) >= 0) {
- //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
-
if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
entry.getScoreMap().put("normlucene", normLev);
//make sure we don't produce a duplicate
@@ -182,7 +176,6 @@ public class GazetteerSearcher {
* @param searchString the named entity to look up in the Lucene index
* @param rowsReturned how many rows to allow lucene to return
*
- * @param properties properties file that states where the lucene indexes
* @return
*/
public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
@@ -278,7 +271,7 @@ public class GazetteerSearcher {
usgsIndex = new MMapDirectory(new File(indexloc));
usgsReader = DirectoryReader.open(usgsIndex);
usgsSearcher = new IndexSearcher(usgsReader);
- usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
}
if (geonamesIndex == null) {
String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
@@ -292,7 +285,7 @@ public class GazetteerSearcher {
geonamesReader = DirectoryReader.open(geonamesIndex);
geonamesSearcher = new IndexSearcher(geonamesReader);
//TODO: a language code switch statement should be employed here at some point
- geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
}
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1575244&r1=1575243&r2=1575244&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Fri Mar 7 12:15:35 2014
@@ -38,14 +38,7 @@ public class GeoEntityLinker implements
private EntityLinkerProperties linkerProperties;
private GazetteerSearcher gazateerSearcher;
private List<LinkedEntityScorer> scorers = new ArrayList<>();
- /**
- * Flag for deciding whether to search gaz only for toponyms within countries
- * that are mentioned in the document
- */
- // private Boolean filterCountryContext = true;
- public GeoEntityLinker() throws Exception {
- }
@Override
public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
@@ -68,18 +61,18 @@ public class GeoEntityLinker implements
* US is the only country mentioned in the doc
*
*/
- ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+ ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)
|| countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
if (!countryMentions.keySet().isEmpty()) {
for (String code : countryMentions.keySet()) {
if (!code.equals("us")) {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, code));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code));
}
}
} else {
- geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, ""));
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, ""));
}
@@ -115,7 +108,6 @@ public class GeoEntityLinker implements
private void loadScorers() {
if (scorers.isEmpty()) {
- // scorers.add(new FuzzyStringMatchScorer());
scorers.add(new GeoHashBinningScorer());
scorers.add(new CountryProximityScorer());
scorers.add(new ModelBasedScorer());