You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2014/03/07 13:15:36 UTC

svn commit: r1575244 - in /opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker: GazetteerIndexer.java GazetteerSearcher.java GeoEntityLinker.java

Author: markg
Date: Fri Mar  7 12:15:35 2014
New Revision: 1575244

URL: http://svn.apache.org/r1575244
Log:
OPENNLP-664
Fixed: country codes are no longer ignored.

Modified:
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
    opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java?rev=1575244&r1=1575243&r2=1575244&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerIndexer.java Fri Mar  7 12:15:35 2014
@@ -23,9 +23,9 @@ import java.util.List;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
@@ -94,7 +94,7 @@ public class GazetteerIndexer {
     String indexloc = outputIndexDir + type.toString();
     Directory index = new MMapDirectory(new File(indexloc));
 
-    Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
+    Analyzer a = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
     IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
 
     IndexWriter w = new IndexWriter(index, config);
@@ -107,9 +107,8 @@ public class GazetteerIndexer {
 
   public void readFile(File gazateerInputData, IndexWriter w, GazType type) throws Exception {
     BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
-    List<String> fields = new ArrayList<String>();
+    List<String> fields = new ArrayList<>();
     int counter = 0;
-    // int langCodeIndex = 0;
     System.out.println("reading gazetteer data from file...........");
     while (reader.read() != -1) {
       String line = reader.readLine();
@@ -137,14 +136,4 @@ public class GazetteerIndexer {
     System.out.println("Completed indexing gaz! index name is: " + type.toString());
   }
 
-  /**
-   * TODO: make these analyzers configurable
-   */
-//  private void loadAnalyzerMap() {
-////    languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
-////    languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
-////    languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
-////    languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
-//
-//  }
 }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java?rev=1575244&r1=1575243&r2=1575244&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GazetteerSearcher.java Fri Mar  7 12:15:35 2014
@@ -35,6 +35,7 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.util.Version;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
+import org.apache.lucene.analysis.util.CharArraySet;
 
 /**
  *
@@ -72,18 +73,14 @@ public class GazetteerSearcher {
    */
   public ArrayList<GazetteerEntry> geonamesFind(String searchString, int rowsReturned, String code) {
     ArrayList<GazetteerEntry> linkedData = new ArrayList<>();
-    if (code.toLowerCase().equals("in") && searchString.toLowerCase().equals("india")) {
-      rowsReturned=100;
-      System.out.println("india");
-    }
-    String luceneQueryString = "";
+
     try {
       /**
        * build the search string Sometimes no country context is found. In this
        * case the code variable will be an empty string
        */
-      luceneQueryString = !code.equals("")
-              ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
+      String luceneQueryString = !code.equals("")
+              ? "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim() + " AND CC1:" + code.toLowerCase()//+"^90000" //[\"" + code.toLowerCase()+"\" TO \"" + code.toLowerCase() + "\"]"
               : "FULL_NAME_ND_RO:" + searchString.toLowerCase().trim();
       /**
        * check the cache and go no further if the records already exist
@@ -97,7 +94,6 @@ public class GazetteerSearcher {
       QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
       Query q = parser.parse(luceneQueryString);
 
-
       TopDocs search = geonamesSearcher.search(q, rowsReturned);
 
       for (int i = 0; i < search.scoreDocs.length; ++i) {
@@ -135,7 +131,7 @@ public class GazetteerSearcher {
               break;
             case 12:
               entry.setItemParentID(value);
-              if(!value.toLowerCase().equals(code.toLowerCase())){
+              if (!value.toLowerCase().equals(code.toLowerCase())) {
                 continue;
               }
               break;
@@ -153,8 +149,6 @@ public class GazetteerSearcher {
          * only want hits above the levenstein thresh
          */
         if (normLev.compareTo(scoreCutoff) >= 0) {
-          //only keep it if the country code is a match. even when the code is passed in as a weighted condition, there is no == equiv in lucene
-
           if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
             entry.getScoreMap().put("normlucene", normLev);
             //make sure we don't produce a duplicate
@@ -182,7 +176,6 @@ public class GazetteerSearcher {
    * @param searchString the nameed entity to look up in the lucene index
    * @param rowsReturned how many rows to allow lucene to return
    *
-   * @param properties   properties file that states where the lucene indexes
    * @return
    */
   public ArrayList<GazetteerEntry> usgsFind(String searchString, int rowsReturned) {
@@ -278,7 +271,7 @@ public class GazetteerSearcher {
       usgsIndex = new MMapDirectory(new File(indexloc));
       usgsReader = DirectoryReader.open(usgsIndex);
       usgsSearcher = new IndexSearcher(usgsReader);
-      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+      usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
     }
     if (geonamesIndex == null) {
       String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
@@ -292,7 +285,7 @@ public class GazetteerSearcher {
       geonamesReader = DirectoryReader.open(geonamesIndex);
       geonamesSearcher = new IndexSearcher(geonamesReader);
       //TODO: a language code switch statement should be employed here at some point
-      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+      geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45, new CharArraySet(Version.LUCENE_45, new ArrayList(), true));
 
     }
   }

Modified: opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java?rev=1575244&r1=1575243&r2=1575244&view=diff
==============================================================================
--- opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/addons/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/GeoEntityLinker.java Fri Mar  7 12:15:35 2014
@@ -38,14 +38,7 @@ public class GeoEntityLinker implements 
   private EntityLinkerProperties linkerProperties;
   private GazetteerSearcher gazateerSearcher;
   private List<LinkedEntityScorer> scorers = new ArrayList<>();
-  /**
-   * Flag for deciding whether to search gaz only for toponyms within countries
-   * that are mentioned in the document
-   */
- // private Boolean filterCountryContext = true;
 
-  public GeoEntityLinker() throws Exception {
-  }
 
   @Override
   public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
@@ -68,18 +61,18 @@ public class GeoEntityLinker implements 
          * US is the only country mentioned in the doc
          *
          */
-        ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+        ArrayList<BaseLink> geoNamesEntries = new ArrayList<>();
         if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1)
                 || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
         
           if (!countryMentions.keySet().isEmpty()) {
             for (String code : countryMentions.keySet()) {
               if (!code.equals("us")) {
-                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, code));
+                geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code));
               }
             }
           } else {
-            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 3, ""));
+            geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, ""));
 
           }
 
@@ -115,7 +108,6 @@ public class GeoEntityLinker implements 
 
   private void loadScorers() {
     if (scorers.isEmpty()) {
-    //  scorers.add(new FuzzyStringMatchScorer());
       scorers.add(new GeoHashBinningScorer());
       scorers.add(new CountryProximityScorer());
       scorers.add(new ModelBasedScorer());