You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/03/05 15:21:54 UTC
svn commit: r1574498 - in
/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker:
GazetteerEntry.java GazetteerIndexer.java GazetteerSearcher.java
GeoEntityLinker.java
Author: markg
Date: Wed Mar 5 14:21:54 2014
New Revision: 1574498
URL: http://svn.apache.org/r1574498
Log:
OPENNLP-630
Fixed toString() in LinkedSpan and BaseLink to be more friendly to the CLI tool (and others).
Modified:
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java?rev=1574498&r1=1574497&r2=1574498&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java Wed Mar 5 14:21:54 2014
@@ -122,7 +122,7 @@ public class GazetteerEntry extends Base
@Override
public String toString() {
- return super.toString() + "\n GazateerEntry{\n" + "\tlatitude=" + latitude + ", \n\tlongitude=" + longitude + ", \n\tsource=" + source + ", \n\tindexID=" + indexID + ", \n\tindexData=" + indexData + "\n}";
+ return super.toString() + "\n\t\tGazateerEntry\n" + "\t\tlatitude=" + latitude + ", \n\t\tlongitude=" + longitude + ", \n\t\tsource=" + source + ", \n\t\tindexID=" + indexID + ",\n\t\tindexData=" + indexData + "\n";
}
@Override
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java?rev=1574498&r1=1574497&r2=1574498&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java Wed Mar 5 14:21:54 2014
@@ -40,13 +40,9 @@ import org.apache.lucene.util.Version;
public class GazetteerIndexer {
public GazetteerIndexer() {
- // loadAnalyzerMap();
+
}
- /**
- * build this into a future release, causing problems at query time
- */
- // Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
public static interface Separable {
@@ -82,15 +78,15 @@ public class GazetteerIndexer {
/**
* indexes the USGS or Geonames gazateers.
*
- * @param outputIndexDir a DIRECTORY path where you would like to store the
- * output lucene indexes
- * @param gazateerInputData the file, "as is" that was downloaded from the
- * USGS and GEONAMES website
- * @param type indicates whether the data is USGS or GEONAMES
- * format
+ * @param outputIndexDir a DIRECTORY path where you would like to store
+ * the output lucene indexes
+ * @param gazetteerInputData the file, "as is" that was downloaded from the
+ * USGS and GEONAMES website
+ * @param type indicates whether the data is USGS or GEONAMES
+ * format
* @throws Exception
*/
- public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
+ public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {
if (!outputIndexDir.isDirectory()) {
throw new IllegalArgumentException("outputIndexDir must be a directory.");
}
@@ -103,7 +99,7 @@ public class GazetteerIndexer {
IndexWriter w = new IndexWriter(index, config);
- readFile(gazateerInputData, w, type);
+ readFile(gazetteerInputData, w, type);
w.commit();
w.close();
@@ -114,31 +110,24 @@ public class GazetteerIndexer {
List<String> fields = new ArrayList<String>();
int counter = 0;
// int langCodeIndex = 0;
- System.out.println("reading gazateer data from file...........");
+ System.out.println("reading gazetteer data from file...........");
while (reader.read() != -1) {
String line = reader.readLine();
String[] values = line.split(type.getSeparator());
if (counter == 0) {
- // build fields
- for (int i = 0; i < values.length; i++) {
- String columnName = values[i];
+ for (String columnName : values) {
fields.add(columnName.replace("»¿", "").trim());
-
}
} else {
Document doc = new Document();
- for (int i = 0; i < fields.size() - 1; i++) {
-
- doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
-
- }
-
- w.addDocument(doc);
-
+ for (int i = 0; i < fields.size() - 1; i++) {
+ doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+ }
+ w.addDocument(doc);
}
counter++;
- if (counter % 10000 == 0) {
+ if (counter % 100000 == 0) {
w.commit();
System.out.println(counter + " .........committed to index..............");
}
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1574498&r1=1574497&r2=1574498&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Wed Mar 5 14:21:54 2014
@@ -38,7 +38,7 @@ import opennlp.tools.entitylinker.Entity
/**
*
- * Searches Gazateers stored in a MMapDirectory Lucene index. The structure of
+ * Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of
* these indices are based on loading the indexes using the
* GeoEntityLinkerSetupUtils
*
@@ -67,12 +67,13 @@ public class GazetteerSearcher {
* @param searchString the named entity to look up in the lucene index
* @param rowsReturned how many rows to allow lucene to return
* @param code the country code
-
+ *
* @return
*/
public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
- if(code.toLowerCase().equals("in") && searchString.toLowerCase().equals("india")){
+ if (code.toLowerCase().equals("in") && searchString.toLowerCase().equals("india")) {
+ rowsReturned=100;
System.out.println("india");
}
String luceneQueryString = "";
@@ -82,7 +83,7 @@ public class GazetteerSearcher {
* case the code variable will be an empty string
*/
luceneQueryString = !code.equals("")
- ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:\""+code.toLowerCase()+"\"" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
+ ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
: "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
/**
* check the cache and go no further if the records already exist
@@ -93,10 +94,10 @@ public class GazetteerSearcher {
return get;
}
-
QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
Query q = parser.parse(luceneQueryString);
+
TopDocs search = geonamesSearcher.search(q, rowsReturned);
for (int i = 0; i < search.scoreDocs.length; ++i) {
@@ -105,8 +106,6 @@ public class GazetteerSearcher {
double sc = search.scoreDocs[i].score;
entry.getScoreMap().put("lucene", sc);
-
-
entry.setIndexID(docId + "");
entry.setSource("geonames");
@@ -136,8 +135,8 @@ public class GazetteerSearcher {
break;
case 12:
entry.setItemParentID(value);
- if(entry.getItemParentID().equals("in")){
- System.out.println("");
+ if(!value.toLowerCase().equals(code.toLowerCase())){
+ continue;
}
break;
case 23:
@@ -191,7 +190,6 @@ public class GazetteerSearcher {
String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
try {
-
/**
* hit the cache
*/
Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1574498&r1=1574497&r2=1574498&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Wed Mar 5 14:21:54 2014
@@ -42,7 +42,7 @@ public class GeoEntityLinker implements
* Flag for deciding whether to search gaz only for toponyms within countries
* that are mentioned in the document
*/
- private Boolean filterCountryContext = true;
+ // private Boolean filterCountryContext = true;
public GeoEntityLinker() throws Exception {
}