You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/03/05 15:21:54 UTC

svn commit: r1574498 - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: GazetteerEntry.java GazetteerIndexer.java GazetteerSearcher.java GeoEntityLinker.java

Author: markg
Date: Wed Mar  5 14:21:54 2014
New Revision: 1574498

URL: http://svn.apache.org/r1574498
Log:
OPENNLP-630
Fixed toString() in LinkedSpan and BaseLink to be more friendly to the CLI tool (and others).

Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java?rev=1574498&r1=1574497&r2=1574498&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerEntry.java Wed Mar  5 14:21:54 2014
@@ -122,7 +122,7 @@ public class GazetteerEntry extends Base
   @Override
   public String toString() {
 
-    return super.toString() + "\n GazateerEntry{\n" + "\tlatitude=" + latitude + ", \n\tlongitude=" + longitude + ", \n\tsource=" + source + ", \n\tindexID=" + indexID + ", \n\tindexData=" + indexData + "\n}";
+    return super.toString() + "\n\t\tGazateerEntry\n" + "\t\tlatitude=" + latitude + ", \n\t\tlongitude=" + longitude + ", \n\t\tsource=" + source + ", \n\t\tindexID=" + indexID + ",\n\t\tindexData=" + indexData + "\n";
   }
 
   @Override

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java?rev=1574498&r1=1574497&r2=1574498&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java Wed Mar  5 14:21:54 2014
@@ -40,13 +40,9 @@ import org.apache.lucene.util.Version;
 public class GazetteerIndexer {
 
   public GazetteerIndexer() {
-    // loadAnalyzerMap();
+
   }
 
-  /**
-   * build this into a future release, causing problems at query time
-   */
-  // Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
 
   public static interface Separable {
 
@@ -82,15 +78,15 @@ public class GazetteerIndexer {
   /**
    * indexes the USGS or Geonames gazateers.
    *
-   * @param outputIndexDir    a DIRECTORY path where you would like to store the
-   *                          output lucene indexes
-   * @param gazateerInputData the file, "as is" that was downloaded from the
-   *                          USGS and GEONAMES website
-   * @param type              indicates whether the data is USGS or GEONAMES
-   *                          format
+   * @param outputIndexDir     a DIRECTORY path where you would like to store
+   *                           the output lucene indexes
+   * @param gazetteerInputData the file, "as is" that was downloaded from the
+   *                           USGS and GEONAMES website
+   * @param type               indicates whether the data is USGS or GEONAMES
+   *                           format
    * @throws Exception
    */
-  public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
+  public void index(File outputIndexDir, File gazetteerInputData, GazType type) throws Exception {
     if (!outputIndexDir.isDirectory()) {
       throw new IllegalArgumentException("outputIndexDir must be a directory.");
     }
@@ -103,7 +99,7 @@ public class GazetteerIndexer {
 
     IndexWriter w = new IndexWriter(index, config);
 
-    readFile(gazateerInputData, w, type);
+    readFile(gazetteerInputData, w, type);
     w.commit();
     w.close();
 
@@ -114,31 +110,24 @@ public class GazetteerIndexer {
     List<String> fields = new ArrayList<String>();
     int counter = 0;
     // int langCodeIndex = 0;
-    System.out.println("reading gazateer data from file...........");
+    System.out.println("reading gazetteer data from file...........");
     while (reader.read() != -1) {
       String line = reader.readLine();
       String[] values = line.split(type.getSeparator());
       if (counter == 0) {
-        // build fields
-        for (int i = 0; i < values.length; i++) {
-          String columnName = values[i];
+        for (String columnName : values) {
           fields.add(columnName.replace("»¿", "").trim());
-         
         }
 
       } else {
         Document doc = new Document();
-        for (int i = 0; i < fields.size() - 1; i++) {
-
-          doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
-
-        }
-      
-          w.addDocument(doc);
-        
+        for (int i = 0; i < fields.size() - 1; i++) {         
+          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+        }     
+        w.addDocument(doc);
       }
       counter++;
-      if (counter % 10000 == 0) {
+      if (counter % 100000 == 0) {
         w.commit();
         System.out.println(counter + " .........committed to index..............");
       }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1574498&r1=1574497&r2=1574498&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Wed Mar  5 14:21:54 2014
@@ -38,7 +38,7 @@ import opennlp.tools.entitylinker.Entity
 
 /**
  *
- * Searches Gazateers stored in a MMapDirectory Lucene index. The structure of
+ * Searches Gazetteers stored in a MMapDirectory Lucene index. The structure of
  * these indices are based on loading the indexes using the
  * GeoEntityLinkerSetupUtils
  *
@@ -67,12 +67,13 @@ public class GazetteerSearcher {
    * @param searchString the named entity to look up in the lucene index
    * @param rowsReturned how many rows to allow lucene to return
    * @param code         the country code
-
+   *
    * @return
    */
   public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
-    if(code.toLowerCase().equals("in") && searchString.toLowerCase().equals("india")){
+    if (code.toLowerCase().equals("in") && searchString.toLowerCase().equals("india")) {
+      rowsReturned=100;
       System.out.println("india");
     }
     String luceneQueryString = "";
@@ -82,7 +83,7 @@ public class GazetteerSearcher {
        * case the code variable will be an empty string
        */
       luceneQueryString = !code.equals("")
-              ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:\""+code.toLowerCase()+"\"" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
+              ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
               : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
       /**
        * check the cache and go no further if the records already exist
@@ -93,10 +94,10 @@ public class GazetteerSearcher {
         return get;
       }
 
-
       QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
       Query q = parser.parse(luceneQueryString);
 
+
       TopDocs search = geonamesSearcher.search(q, rowsReturned);
 
       for (int i = 0; i < search.scoreDocs.length; ++i) {
@@ -105,8 +106,6 @@ public class GazetteerSearcher {
         double sc = search.scoreDocs[i].score;
 
         entry.getScoreMap().put("lucene", sc);
-
-
         entry.setIndexID(docId + "");
         entry.setSource("geonames");
 
@@ -136,8 +135,8 @@ public class GazetteerSearcher {
               break;
             case 12:
               entry.setItemParentID(value);
-              if(entry.getItemParentID().equals("in")){
-                System.out.println("");
+              if(!value.toLowerCase().equals(code.toLowerCase())){
+                continue;
               }
               break;
             case 23:
@@ -191,7 +190,6 @@ public class GazetteerSearcher {
     String luceneQueryString = "FEATURE_NAME:" + searchString.toLowerCase().trim() + " OR MAP_NAME: " + searchString.toLowerCase().trim();
     try {
 
-
       /**
        * hit the cache
        */

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1574498&r1=1574497&r2=1574498&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Wed Mar  5 14:21:54 2014
@@ -42,7 +42,7 @@ public class GeoEntityLinker implements 
    * Flag for deciding whether to search gaz only for toponyms within countries
    * that are mentioned in the document
    */
-  private Boolean filterCountryContext = true;
+ // private Boolean filterCountryContext = true;
 
   public GeoEntityLinker() throws Exception {
   }