You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@opennlp.apache.org by Jörn Kottmann <ko...@gmail.com> on 2013/10/22 19:47:47 UTC
Re: svn commit: r1533959 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: entitylinker/ entitylinker/domain/ ngram/

You added the NGramGenerator class to the ngram package,
but we already have a class, NGramModel, to create ngrams.

Would it be possible for you to use that one instead, so we avoid 
duplication?

Jörn

On 10/20/2013 10:04 PM, markg@apache.org wrote:
> Author: markg
> Date: Sun Oct 20 20:04:41 2013
> New Revision: 1533959
>
> URL: http://svn.apache.org/r1533959
> Log:
> OPENNLP-579
> GeoEntityLinkerImpl: Implemented better scoring using Dice coefficient of bigram, as well as highly improved scoring based on country context. Created an NgramGenerator class and a FuzzyStringMatching class, assuming they would be useful for other linker impls. Implemented Regex based discovery of countrycontext, which enabled proximity based analysis of doctext
> Multiple other small efficiencies in the GeoEntityLinker
>
> Added:
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java
> Modified:
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java
>      opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java
>
> Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java?rev=1533959&r1=1533958&r2=1533959&view=diff
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java (original)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/CountryContext.java Sun Oct 20 20:04:41 2013
> @@ -21,23 +21,42 @@ import java.sql.DriverManager;
>   import java.sql.ResultSet;
>   import java.sql.SQLException;
>   import java.util.ArrayList;
> +import java.util.HashMap;
> +import java.util.HashSet;
>   import java.util.List;
> +import java.util.Map;
> +import java.util.Set;
>   import java.util.logging.Level;
>   import java.util.logging.Logger;
> +import java.util.regex.Matcher;
> +import java.util.regex.Pattern;
>   
>   /**
> - *Finds instances of country mentions in a String, typically a document text.
> + * Finds instances of country mentions in a String, typically a document text.
>    * Used to boost or degrade scoring of linked geo entities
> -
> + *
>    */
>   public class CountryContext {
>   
>     private Connection con;
>     private List<CountryContextEntry> countrydata;
> +  private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
> +
> +  public Map<String, Set<String>> getNameCodesMap() {
> +    return nameCodesMap;
> +  }
> +
> +  public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
> +    this.nameCodesMap = nameCodesMap;
> +  }
>   
>     public CountryContext() {
>     }
>   
> +  /**
> +   * use regexFind
> +   */
> +  @Deprecated
>     public List<CountryContextHit> find(String docText, EntityLinkerProperties properties) {
>       List<CountryContextHit> hits = new ArrayList<CountryContextHit>();
>       try {
> @@ -51,7 +70,7 @@ public class CountryContext {
>   
>           if (docText.contains(entry.getFull_name_nd_ro())) {
>             System.out.println("\tFound Country indicator: " + entry.getFull_name_nd_ro());
> -          CountryContextHit hit = new CountryContextHit(entry.getCc1(), docText.indexOf(entry.getFull_name_nd_ro()), docText.indexOf(entry.getFull_name_nd_ro()+ entry.getFull_name_nd_ro().length()));
> +          CountryContextHit hit = new CountryContextHit(entry.getCc1(), docText.indexOf(entry.getFull_name_nd_ro()), docText.indexOf(entry.getFull_name_nd_ro() + entry.getFull_name_nd_ro().length()));
>             hits.add(hit);
>           }
>         }
> @@ -60,6 +79,81 @@ public class CountryContext {
>         Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);
>       }
>       return hits;
> +
> +  }
> +/**
> + * Finds mentions of countries based on a list from MySQL stored procedure called getCountryList. This method finds country mentions in documents,
> + * which is an essential element of the scoring that is done for geo linkedspans. Lazily loads the list from the database.
> + * @param docText the full text of the document
> + * @param properties EntityLinkerProperties for getting database connection
> + * @return
> + */
> +  public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
> +    Map<String, Set<Integer>> hits = new HashMap<String, Set<Integer>>();
> +    try {
> +      if (con == null) {
> +        con = getMySqlConnection(properties);
> +      }
> +      if (countrydata == null) {
> +        countrydata = getCountryData(properties);
> +      }
> +      for (CountryContextEntry entry : countrydata) {
> +        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
> +        Matcher rs = regex.matcher(docText);
> +        String code = entry.getCc1().toLowerCase();
> +        while (rs.find()) {
> +          Integer start = rs.start();
> +          String hit = rs.group().toLowerCase();
> +          if (hits.containsKey(code)) {
> +            hits.get(code).add(start);
> +          } else {
> +            Set<Integer> newset = new HashSet<Integer>();
> +            newset.add(start);
> +            hits.put(code, newset);
> +          }
> +          if (!hit.equals("")) {
> +            if (this.nameCodesMap.containsKey(hit)) {
> +              nameCodesMap.get(hit).add(code);
> +            } else {
> +              HashSet<String> newset = new HashSet<String>();
> +              newset.add(code);
> +              nameCodesMap.put(hit, newset);
> +            }
> +          }
> +        }
> +
> +      }
> +
> +    } catch (Exception ex) {
> +      Logger.getLogger(CountryContext.class.getName()).log(Level.SEVERE, null, ex);
> +    }
> +
> +    //System.out.println(hits);
> +    return hits;
> +  }
> +/**
> + * returns a unique list of country codes
> + * @param hits the hits discovered
> + * @return
> + */
> +  public static Set<String> getCountryCodes(List<CountryContextHit> hits) {
> +    Set<String> ccs = new HashSet<String>();
> +    for (CountryContextHit hit : hits) {
> +      ccs.add(hit.getCountryCode().toLowerCase());
> +    }
> +    return ccs;
> +  }
> +
> +  public static String getCountryCodeCSV(Set<String> hits) {
> +    String csv = "";
> +    if (hits.isEmpty()) {
> +      return csv;
> +    }
> +
> +    for (String code : hits) {
> +      csv += "," + code;
> +    }
> +    return csv.substring(1);
>     }
>   
>     private Connection getMySqlConnection(EntityLinkerProperties properties) throws Exception {
> @@ -73,7 +167,12 @@ public class CountryContext {
>       Connection conn = DriverManager.getConnection(url, username, password);
>       return conn;
>     }
> -
> +/**
> + * reads the list from the database by calling a stored procedure getCountryList
> + * @param properties
> + * @return
> + * @throws SQLException
> + */
>     private List<CountryContextEntry> getCountryData(EntityLinkerProperties properties) throws SQLException {
>       List<CountryContextEntry> entries = new ArrayList<CountryContextEntry>();
>       try {
>
> Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java?rev=1533959&view=auto
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java (added)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/FuzzyStringMatcher.java Sun Oct 20 20:04:41 2013
> @@ -0,0 +1,49 @@
> +/*
> + * Copyright 2013 The Apache Software Foundation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package opennlp.tools.entitylinker;
> +
> +import java.util.HashSet;
> +import java.util.List;
> +import java.util.Set;
> +import opennlp.tools.ngram.NGramGenerator;
> +
> +/**
> + *
> + *Generates scores for string comparisons.
> + */
> +public class FuzzyStringMatcher {
> +/**
> + * Generates a score based on an overlap of nGrams between two strings using the DiceCoefficient technique.
> + *
> + * @param s1 first string
> + * @param s2 second string
> + * @param nGrams number of chars in each gram
> + * @return
> + */
> +  public static double getDiceCoefficient(String s1, String s2, int nGrams) {
> +    if (s1.equals("") || s1.equals("")) {
> +      return 0d;
> +    }
> +    List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
> +    List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");
> +
> +    Set<String> overlap = new HashSet<String>(s1Grams);
> +    overlap.retainAll(s2Grams);
> +    double totcombigrams = overlap.size();
> +
> +    return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
> +  }
> +}
>
> Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java?rev=1533959&r1=1533958&r2=1533959&view=diff
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java (original)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityLinker.java Sun Oct 20 20:04:41 2013
> @@ -19,6 +19,8 @@ import java.io.File;
>   import java.io.IOException;
>   import java.util.ArrayList;
>   import java.util.List;
> +import java.util.Map;
> +import java.util.Set;
>   import java.util.logging.Level;
>   import java.util.logging.Logger;
>   import opennlp.tools.entitylinker.domain.BaseLink;
> @@ -26,17 +28,24 @@ import opennlp.tools.entitylinker.domain
>   import opennlp.tools.util.Span;
>   
>   /**
> - * Links location entities to gazatteers.
> + * Links location entities to gazatteers. Currently supports gazateers in a
> + * MySql database (NGA and USGS)
>    *
>    *
>    */
>   public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
>   
> +  GeoEntityScorer scorer = new GeoEntityScorer();
>     private MySQLGeoNamesGazLinkable geoNamesGaz;// = new MySQLGeoNamesGazLinkable();
>     private MySQLUSGSGazLinkable usgsGaz;//= new MySQLUSGSGazLinkable();
>     private CountryContext countryContext;
> -  private List<CountryContextHit> hits;
> -  private EntityLinkerProperties props;
> +  private Map<String, Set<Integer>> countryMentions;
> +  private EntityLinkerProperties linkerProperties;
> +  /**
> +   * Flag for deciding whether to search gaz only for toponyms within countries
> +   * that are mentioned in the document
> +   */
> +  private Boolean filterCountryContext=true;
>   
>     public GeoEntityLinker() {
>       if (geoNamesGaz == null || usgsGaz == null) {
> @@ -50,25 +59,44 @@ public class GeoEntityLinker implements
>     public List<LinkedSpan> find(String text, Span[] sentences, String[] tokens, Span[] names) {
>       ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
>       try {
> -      if (props == null) {
> -        props = new EntityLinkerProperties(new File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
> +      if (linkerProperties == null) {
> +        linkerProperties = new EntityLinkerProperties(new File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
>         }
> -      if (hits == null) {
> -        System.out.println("getting country context");
> -        hits = countryContext.find(text, props);
> -      }
> -
> +
> +        countryMentions = countryContext.regexfind(text, linkerProperties);
> +
> +      //prioritize query
> +      filterCountryContext = Boolean.valueOf(linkerProperties.getProperty("geoentitylinker.filter_by_country_context", "true"));
>         String[] matches = Span.spansToStrings(names, tokens);
>         for (int i = 0; i < matches.length; i++) {
> -        System.out.println("processing match " + i + " of " + matches.length);
> -        ArrayList<BaseLink> geoNamesEntries = geoNamesGaz.find(matches[i], names[i], hits, props);
> -        ArrayList<BaseLink> usgsEntries = usgsGaz.find(matches[i], names[i], hits, props);
> -        LinkedSpan<BaseLink> geoSpans = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
> -        geoSpans.getLinkedEntries().addAll(usgsEntries);
> -        geoSpans.setSearchTerm(matches[i]);
> -        spans.add(geoSpans);
> +
> +//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
> +        ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
> +        if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1) {
> +          geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
> +        }
> +        ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
> +        if (countryMentions.keySet().contains("us")) {
> +          usgsEntries = usgsGaz.find(matches[i], names[i], countryMentions, linkerProperties);
> +        }
> +        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
> +
> +        if (!usgsEntries.isEmpty()) {
> +          geoSpan.getLinkedEntries().addAll(usgsEntries);
> +          geoSpan.setSearchTerm(matches[i]);
> +        }
> +
> +        if (!geoSpan.getLinkedEntries().isEmpty()) {
> +          geoSpan.setSearchTerm(matches[i]);
> +          spans.add(geoSpan);
> +        }
> +
>         }
> -      return spans;
> +      //score the spans
> +
> +      scorer.score(spans, countryMentions, countryContext.getNameCodesMap(), text, sentences, 1000);
> +
> +      //  return spans;
>       } catch (IOException ex) {
>         Logger.getLogger(GeoEntityLinker.class.getName()).log(Level.SEVERE, null, ex);
>       }
> @@ -78,12 +106,14 @@ public class GeoEntityLinker implements
>     public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] names) {
>       ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
>       try {
> -
> -
> -      if (props == null) {
> -        props = new EntityLinkerProperties(new File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
> +      if (linkerProperties == null) {
> +        linkerProperties = new EntityLinkerProperties(new File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
>         }
> -      List<CountryContextHit> hits = countryContext.find(text, props);
> +
> +        //  System.out.println("getting country context");
> +        //hits = countryContext.find(text, linkerProperties);
> +        countryMentions = countryContext.regexfind(text, linkerProperties);
> +
>         //get the sentence text....must assume some index
>         Span s = sentences[0];
>         String sentence = text.substring(s.getStart(), s.getEnd());
> @@ -92,17 +122,32 @@ public class GeoEntityLinker implements
>         //get the names based on the tokens
>         String[] matches = Span.spansToStrings(names, stringtokens);
>         for (int i = 0; i < matches.length; i++) {
> -        ArrayList<BaseLink> geoNamesEntries = geoNamesGaz.find(matches[i], names[i], hits, props);
> -        ArrayList<BaseLink> usgsEntries = usgsGaz.find(matches[i], names[i], hits, props);
> -        LinkedSpan<BaseLink> geoSpans = new LinkedSpan<BaseLink>(geoNamesEntries, names[i], 0);
> -        geoSpans.getLinkedEntries().addAll(usgsEntries);
> -        geoSpans.setSearchTerm(matches[i]);
> -        spans.add(geoSpans);
> +        //nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
> +        ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
> +        if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1) {
> +          geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
> +        }
> +        ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
> +        if (countryMentions.keySet().contains("us")) {
> +          usgsEntries = usgsGaz.find(matches[i], names[i], countryMentions, linkerProperties);
> +        }
> +        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
> +
> +        if (!usgsEntries.isEmpty()) {
> +          geoSpan.getLinkedEntries().addAll(usgsEntries);
> +          geoSpan.setSearchTerm(matches[i]);
> +        }
> +
> +        if (!geoSpan.getLinkedEntries().isEmpty()) {
> +          geoSpan.setSearchTerm(matches[i]);
> +          spans.add(geoSpan);
> +        }
>         }
> -      return spans;
> +
>       } catch (IOException ex) {
>         Logger.getLogger(GeoEntityLinker.class.getName()).log(Level.SEVERE, null, ex);
>       }
> +    scorer.score(spans, countryMentions, countryContext.getNameCodesMap(), text, sentences, 1000);
>       return spans;
>     }
>   
> @@ -110,10 +155,11 @@ public class GeoEntityLinker implements
>       ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
>       try {
>   
> -      if (props == null) {
> -        props = new EntityLinkerProperties(new File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
> +      if (linkerProperties == null) {
> +        linkerProperties = new EntityLinkerProperties(new File("C:\\temp\\opennlpmodels\\entitylinker.properties"));
>         }
> -      List<CountryContextHit> hits = countryContext.find(text, props);
> +
> +      countryMentions = countryContext.regexfind(text, linkerProperties);
>   
>         Span s = sentences[sentenceIndex];
>         String sentence = text.substring(s.getStart(), s.getEnd());
> @@ -123,15 +169,29 @@ public class GeoEntityLinker implements
>         String[] matches = Span.spansToStrings(names, stringtokens);
>   
>         for (int i = 0; i < matches.length; i++) {
> -        ArrayList<BaseLink> geoNamesEntries = geoNamesGaz.find(matches[i], names[i], hits, props);
> -        ArrayList<BaseLink> usgsEntries = usgsGaz.find(matches[i], names[i], hits, props);
> -        LinkedSpan<BaseLink> geoSpans = new LinkedSpan<BaseLink>(geoNamesEntries, names[i], 0);
> -        geoSpans.getLinkedEntries().addAll(usgsEntries);
> -        geoSpans.setSearchTerm(matches[i]);
> -        geoSpans.setSentenceid(sentenceIndex);
> -        spans.add(geoSpans);
> +//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
> +        ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
> +        if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1) {
> +          geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
> +        }
> +        ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
> +        if (countryMentions.keySet().contains("us")) {
> +          usgsEntries = usgsGaz.find(matches[i], names[i], countryMentions, linkerProperties);
> +        }
> +        LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
> +
> +        if (!usgsEntries.isEmpty()) {
> +          geoSpan.getLinkedEntries().addAll(usgsEntries);
> +          geoSpan.setSearchTerm(matches[i]);
> +        }
> +
> +        if (!geoSpan.getLinkedEntries().isEmpty()) {
> +          geoSpan.setSearchTerm(matches[i]);
> +          geoSpan.setSentenceid(sentenceIndex);
> +          spans.add(geoSpan);
> +        }
>         }
> -
> +      scorer.score(spans, countryMentions, countryContext.getNameCodesMap(), text, sentences, 2000);
>       } catch (IOException ex) {
>         Logger.getLogger(GeoEntityLinker.class.getName()).log(Level.SEVERE, null, ex);
>       }
> @@ -139,6 +199,6 @@ public class GeoEntityLinker implements
>     }
>   
>     public void setEntityLinkerProperties(EntityLinkerProperties properties) {
> -    this.props = properties;
> +    this.linkerProperties = properties;
>     }
>   }
>
> Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java?rev=1533959&view=auto
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java (added)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/GeoEntityScorer.java Sun Oct 20 20:04:41 2013
> @@ -0,0 +1,256 @@
> +/*
> + * Copyright 2013 The Apache Software Foundation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package opennlp.tools.entitylinker;
> +
> +import java.util.ArrayList;
> +import java.util.HashMap;
> +import java.util.HashSet;
> +import java.util.List;
> +import java.util.Map;
> +import java.util.Set;
> +import java.util.TreeSet;
> +import opennlp.tools.entitylinker.domain.BaseLink;
> +import opennlp.tools.entitylinker.domain.LinkedSpan;
> +import opennlp.tools.util.Span;
> +
> +/**
> + * Scores toponyms based on country context as well as fuzzy string matching
> + */
> +public class GeoEntityScorer {
> +
> +  private Map<String, Set<String>> nameCodesMap;
> +  String dominantCode = "";
> +
> +  /**
> +   * Assigns a score to each BaseLink in each linkedSpan's set of N best
> +   * matches. Currently the scoring indicates the probability that the toponym
> +   * is correct based on the country context in the document and fuzzy string matching
> +   *
> +   * @param linkedData     the linked spans, holds the Namefinder results, and
> +   *                       the list of BaseLink for each
> +   * @param countryHits    all the country mentions in the document
> +   * @param nameCodesMap   maps a country indicator name to a country code. Used
> +   *                       to determine if the namefinder found the same exact
> +   *                       toponym the country context did. If so the score is
> +   *                       boosted due to the high probability that the
> +   *                       NameFinder actually "rediscovered" a country
> +   * @param docText        the full text of the document...not used in this
> +   *                       default implementation
> +   * @param sentences      the sentences that correspond to the doc text.
> +   * @param maxAllowedDist a constant that is used to determine which country
> +   *                       mentions, based on proximity within the text, should
> +   *                       be used to score the Named Entity.
> +   * @return
> +   */
> +  public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
> +    this.nameCodesMap = nameCodesMap;
> +    setDominantCode(countryHits);
> +    for (LinkedSpan<BaseLink> linkedspan : linkedData) {
> +
> +      for (BaseLink link : linkedspan.getLinkedEntries()) {
> +        Double dice = FuzzyStringMatcher.getDiceCoefficient(linkedspan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
> +        /**
> +         * Since MySQL is using "boolean mode" this score will always be very
> +         * high. To allow more recall, change mysql to "natural language mode",
> +         * and this score will become more significant
> +         */
> +        link.setFuzzyStringMatchingScore(dice);
> +
> +      }
> +      linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
> +    }
> +    return linkedData;
> +  }
> +/**
> + * sets class level variable to a code based on the number of mentions
> + * @param countryHits
> + */
> +  private void setDominantCode(Map<String, Set<Integer>> countryHits) {
> +    int hits = -1;
> +    for (String code : countryHits.keySet()) {
> +      if (countryHits.get(code).size() > hits) {
> +        hits = countryHits.get(code).size();
> +        dominantCode = code;
> +      }
> +    }
> +  }
> +
> +  /**
> +   * Generates distances from each country mention to the span's location in the
> +   * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
> +   * are resolved to the correct country and coordinate.
> +   *
> +   * @param sentences
> +   * @param countryHits
> +   * @param span
> +   * @return
> +   */
> +  private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
> +    //get the index of the actual span, begining of sentence
> +    //should generate tokens from sentence and create a char offset...
> +    //could have large sentences due to poor sentence detection or wonky doc text
> +    int sentenceIdx = span.getSentenceid();
> +    int sentIndexInDoc = sentences[sentenceIdx].getStart();
> +    /**
> +     * create a map of all the span's proximal country mentions in the document
> +     * Map< countrycode, set of <distances from this NamedEntity>>
> +     */
> +    Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
> +    //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
> +    for (String cCode : countryHits.keySet()) {
> +//iterate over all the regex start values and calculate an offset
> +      for (Integer cHit : countryHits.get(cCode)) {
> +        Integer absDist = Math.abs(sentIndexInDoc - cHit);
> +        //only include near mentions based on a heuristic
> +        //TODO make this a property
> +        //  if (absDist < maxAllowedDistance) {
> +        if (distancesFromCodeMap.containsKey(cCode)) {
> +          distancesFromCodeMap.get(cCode).add(absDist);
> +        } else {
> +          HashSet<Integer> newset = new HashSet<Integer>();
> +          newset.add(absDist);
> +          distancesFromCodeMap.put(cCode, newset);
> +        }
> +      }
> +
> +      //}
> +    }
> +    //we now know how far this named entity is from every country mention in the document
> +
> +    /**
> +     * the gaz matches that have a country code that have mentions in the doc
> +     * that are closest to the Named Entity should return the best score Analyze
> +     * map generates a likelihood score that the toponym from the gaz is
> +     * referring to one of the countries Map<countrycode, prob that this span is
> +     * referring to the toponym form this code key>
> +     */
> +    Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
> +    for (BaseLink link : span.getLinkedEntries()) {
> +      //getItemParentId is the country code
> +      String spanCountryCode = link.getItemParentID();
> +      if (scoreMap.containsKey(spanCountryCode)) {
> +        link.setScore(scoreMap.get(spanCountryCode));
> +        ///does the name extracted match a country name?
> +        if (nameCodesMap.containsKey(link.getItemName().toLowerCase())) {
> +          //if so, is it the correct country code for that name
> +          if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
> +            //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
> +            //TODO: make this multiplier configurable
> +            //TODO: improve this with a geographic/geometry based clustering (linear binning to be more precise) of points returned from the gaz
> +            Double score = (link.getScore() + .75) > 1.0 ? 1d : (link.getScore() + .75);
> +            //boost the score if the hit is from the dominant country context
> +
> +            if(link.getItemParentID().equals(dominantCode)){
> +              score = (score + .25) > 1.0 ? 1d : (score + .25);
> +            }
> +            link.setScore(score);
> +
> +          }
> +
> +        }
> +      }
> +    }
> +    return span;
> +  }
> +
> +  /**
> +   * Takes a map of distances from the NE to each country mention and generates
> +   * a map of scores for each country code. Distances are min-max normalized over
> +   * all codes, reversed so closer mentions score higher, then averaged per code.
> +   * The keys are meant to be correlated to the BaseLink parent id by the caller.
> +   *
> +   * @param distanceMap distances keyed by country code; needs at least one distance
> +   * @param sentences   not read by this method
> +   * @param span        not read by this method
> +   * @return one score per key of distanceMap, as computed by slidingDistanceAverage
> +   */
> +  private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
> +
> +    Map<String, Double> scoreMap = new HashMap<String, Double>();
> +    TreeSet<Integer> all = new TreeSet<Integer>();
> +    for (String key : distanceMap.keySet()) {
> +      all.addAll(distanceMap.get(key));
> +    }
> +    //get min max over every code's distances for normalization, this could be more efficient (first()/last() throw NoSuchElementException if there are no distances)
> +    Integer min = all.first();
> +    Integer max = all.last();
> +    for (String key : distanceMap.keySet()) {
> +
> +      TreeSet<Double> normalizedDistances = new TreeSet<Double>(); //sorted ascending, duplicate values collapse
> +      for (Integer i : distanceMap.get(key)) {
> +        Double norm = normalize(i, min, max);
> +        //reverse the normed distance so low numbers (closer) are better
> +        //this could be improved with a "decaying" function using an increasing negative exponent
> +        Double reverse = Math.abs(norm - 1);
> +        normalizedDistances.add(reverse);
> +      }
> +
> +
> +      List<Double> doubles = new ArrayList<Double>(normalizedDistances);
> +      scoreMap.put(key, slidingDistanceAverage(doubles));
> +    }
> +    return scoreMap;
> +  }
> +
> +  /**
> +   * Averages the reversed, normalized distances of one country code. With three
> +   * or more values, each adjacent pair is averaged first and the mean of those
> +   * pair averages is returned, which smooths the list so one distant outlier
> +   * does not kill the score for an obviously good hit. A more elegant solution
> +   * is possible using Math.pow with an increasing negative exponent (decay).
> +   *
> +   * @param normDis the normalized and sorted set of distances as a list
> +   * @return the mean of the pair averages, or of the raw values if fewer than 3
> +   */
> +  private Double slidingDistanceAverage(List<Double> normDis) {
> +    List<Double> windowOfAverages = new ArrayList<Double>();
> +
> +    if (normDis.size() < 3) {
> +      windowOfAverages.addAll(normDis);
> +    } else {
> +
> +      for (int i = 0; i < normDis.size() - 1; i++) {
> +        double a = normDis.get(i);
> +        double b = normDis.get(i + 1);
> +        windowOfAverages.add((a + b) / 2);
> +
> +      }
> +    }
> +    double sum = 0d;
> +    for (double d : windowOfAverages) {
> +      sum += d;
> +    }
> +    double result = sum / windowOfAverages.size(); //NOTE(review): NaN for an empty list (0.0 / 0)
> +    //TODO: ++ prob when large amounts of mentions for a code
> +    //System.out.println("avg of window:" + result);
> +    return result;
> +  }
> +
> +  /**
> +   * Min-max normalizes a value: a value inside [minimum, maximum] is mapped to
> +   * its relative position in [0, 1]. Used to normalize distances in this class.
> +   *
> +   * @param valueToNormalize the value to place within the new range
> +   * @param minimum          the min of the set to be transposed
> +   * @param maximum          the max of the set to be transposed
> +   * @return (valueToNormalize - minimum) / (maximum - minimum) as a double
> +   */
> +  private Double normalize(int valueToNormalize, int minimum, int maximum) {
> +    Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0; //NOTE(review): NaN or infinite when maximum == minimum
> +    d = d == null ? 0d : d; //d was just assigned a boxed double, so this null check never fires
> +    return d;
> +  }
> +}
>
> Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java?rev=1533959&r1=1533958&r2=1533959&view=diff
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java (original)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazEntry.java Sun Oct 20 20:04:41 2013
> @@ -18,59 +18,31 @@ package opennlp.tools.entitylinker;
>   import opennlp.tools.entitylinker.domain.BaseLink;
>   
>   /**
> - *
> + * Stores an entry from the NGA Geonames gazetteer
>   
>    */
>   public class MySQLGeoNamesGazEntry extends BaseLink
>   {
> -  ////actual fields returned
> -//ufi,
> -//latitude,
> -//longitude,
> -//cc1,
> -//adm1,
> -//dsg,
> -//SHORT_FORM ,
> -//	SORT_NAME_RO ,
> -//	FULL_NAME_RO ,
> -//	FULL_NAME_ND_RO ,
> -//	SORT_NAME_RG ,
> -//	FULL_NAME_RG ,
> -//	FULL_NAME_ND_RG ,
> -//match(`SHORT_FORM` ,`SORT_NAME_RO`,`FULL_NAME_RO`,`FULL_NAME_ND_RO` ,`SORT_NAME_RG` ,`FULL_NAME_RG` ,`FULL_NAME_ND_RG`)
> -//against(pSearch in natural language mode) as rank
> -
> -  ///////
> -
> - // private String RC;// VARCHAR(150) NULL DEFAULT NULL,
> +
>     private String UFI;
> -  //private String UNI;
> +
>     private Double LATITUDE; //DOUBLE NULL DEFAULT NULL,
>     private Double LONGITUDE;// DOUBLE NULL DEFAULT NULL,
> - // private String DMS_LAT;// VARCHAR(150) NULL DEFAULT NULL,
> - // private String DMS_LONG;// VARCHAR(150) NULL DEFAULT NULL,
> - // private String MGRS;// VARCHAR(150) NULL DEFAULT NULL,
> -//  private String JOG;// VARCHAR(150) NULL DEFAULT NULL,
> - // private String FC;// VARCHAR(150) NULL DEFAULT NULL,
> +
>     private String DSG;// VARCHAR(150) NULL DEFAULT NULL,
> - // private String PC;// VARCHAR(150) NULL DEFAULT NULL,
> +
>     private String CC1;//` VARCHAR(150) NULL DEFAULT NULL,
>     private String ADM1;// VARCHAR(150) NULL DEFAULT NULL,
> - // private String POP;// VARCHAR(150) NULL DEFAULT NULL,
> -  //private String ELEV;//VARCHAR(150) NULL DEFAULT NULL,
> -//  private String CC2;// VARCHAR(150) NULL DEFAULT NULL,
> - // private String NT;//VARCHAR(150) NULL DEFAULT NULL,
> - // private String LC;// VARCHAR(150) NULL DEFAULT NULL,
> +
>     private String SHORT_FORM;// VARCHAR(500) NULL DEFAULT NULL,
> - // private String GENERIC;// VARCHAR(150) NULL DEFAULT NULL,
> +
>     private String SORT_NAME_RO;//VARCHAR(500) NULL DEFAULT NULL,
>     private String FULL_NAME_RO;// VARCHAR(500) NULL DEFAULT NULL,
>     private String FULL_NAME_ND_RO;// VARCHAR(500) NULL DEFAULT NULL,
>     private String SORT_NAME_RG;// VARCHAR(500) NULL DEFAULT NULL,
>     private String FULL_NAME_RG;// VARCHAR(500) NULL DEFAULT NULL,
>     private String FULL_NAME_ND_RG;// VARCHAR(500) NULL DEFAULT NULL,
> -//  private String NOTE;//VARCHAR(500) NULL DEFAULT NULL,
> - // private String MODIFY_DATE;// VARCHAR(150) NULL DEFAULT NULL,
> +
>   private Double rank;
>   
>     public String getUFI()
>
> Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java?rev=1533959&r1=1533958&r2=1533959&view=diff
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java (original)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLGeoNamesGazLinkable.java Sun Oct 20 20:04:41 2013
> @@ -1,17 +1,13 @@
>   package opennlp.tools.entitylinker;
>   
> -/**
> - *
> - * @author Owner
> - */
> +
>   import java.sql.CallableStatement;
>   import java.sql.Connection;
>   import java.sql.DriverManager;
>   import java.sql.ResultSet;
>   import java.sql.SQLException;
>   import java.util.ArrayList;
> -import java.util.HashSet;
> -import java.util.List;
> +import java.util.Map;
>   import java.util.Set;
>   import java.util.logging.Level;
>   import java.util.logging.Logger;
> @@ -20,7 +16,7 @@ import opennlp.tools.util.Span;
>   
>   /**
>    *
> - *
> + * Links names to the NGA gazetteer
>    */
>   public final class MySQLGeoNamesGazLinkable {
>   
> @@ -30,7 +26,7 @@ public final class MySQLGeoNamesGazLinka
>     public MySQLGeoNamesGazLinkable() {
>     }
>   
> -  public ArrayList<BaseLink> find(String locationText, Span span, List<CountryContextHit> countryHits, EntityLinkerProperties properties) {
> +  public ArrayList<BaseLink> find(String locationText, Span span, Map<String, Set<Integer>> countryHits, EntityLinkerProperties properties) {
>       ArrayList<BaseLink> returnlocs = new ArrayList<BaseLink>();
>   
>       try {
> @@ -40,13 +36,13 @@ public final class MySQLGeoNamesGazLinka
>         //   pull from config to utilize country context filtering
>         filterCountryContext = Boolean.valueOf(properties.getProperty("geoentitylinker.filter_by_country_context", "false"));
>   
> -      Set<String> countrycodes = getCountryCodes(countryHits);
> +
>         String thresh = properties.getProperty("mysqlusgsgazscorethresh", "25");
>         int threshhold = -1;
>         if (!thresh.matches("[azAZ]")) {
>           threshhold = Integer.valueOf(thresh);
>         }
> -      returnlocs.addAll(this.searchGaz(locationText, threshhold, countrycodes, properties));
> +      returnlocs.addAll(this.searchGaz(locationText, threshhold, countryHits.keySet(), properties));
>   
>   
>       } catch (Exception ex) {
> @@ -56,7 +52,7 @@ public final class MySQLGeoNamesGazLinka
>     }
>   
>     protected Connection getMySqlConnection(EntityLinkerProperties property) throws Exception {
> -   // EntityLinkerProperties property = new EntityLinkerProperties(new File("c:\\temp\\opennlpmodels\\entitylinker.properties"));
> +    // EntityLinkerProperties property = new EntityLinkerProperties(new File("c:\\temp\\opennlpmodels\\entitylinker.properties"));
>       String driver = property.getProperty("mysql.driver", "org.gjt.mm.mysql.Driver");
>       String url = property.getProperty("mysql.url", "jdbc:mysql://localhost:3306/world");
>       String username = property.getProperty("mysql.username", "root");
> @@ -73,16 +69,23 @@ public final class MySQLGeoNamesGazLinka
>         con = getMySqlConnection(properties);
>       }
>       CallableStatement cs;
> -    cs = con.prepareCall("CALL `search_geonames`(?, ?)");
> +    cs = con.prepareCall("CALL `search_geonames`(?, ?, ?)");
>       cs.setString(1, this.format(searchString));
>       cs.setInt(2, matchthresh);
> -    ArrayList<MySQLGeoNamesGazEntry> retLocs = new ArrayList<MySQLGeoNamesGazEntry>();
> +    if (filterCountryContext) {
> +      cs.setString(3,CountryContext.getCountryCodeCSV(countryCodes));
> +    } else {
> +      //database stored procedure handles empty string
> +      cs.setString(3, "");
> +    }
> +
> +    ArrayList<MySQLGeoNamesGazEntry> toponyms = new ArrayList<MySQLGeoNamesGazEntry>();
>       ResultSet rs;
>       try {
>         rs = cs.executeQuery();
>   
>         if (rs == null) {
> -        return retLocs;
> +        return toponyms;
>         }
>   
>         while (rs.next()) {
> @@ -117,17 +120,13 @@ public final class MySQLGeoNamesGazLinka
>   
>           s.setRank(rs.getDouble(14));
>   
> -        if (filterCountryContext) {
> -          if (countryCodes.contains(s.getCC1().toLowerCase())) {
> -          //  System.out.println(searchString +" GeoNames qualified on: " + s.getCC1());
> -            s.setRank(s.getRank() + 1.0);
> -          } else {
> -         //    System.out.println(s.getFULL_NAME_ND_RO() + ", with CC1 of "+ s.getCC1()+ ", is not within countries discovered in the document. The Country list used to discover countries can be modified in mysql procedure getCountryList()");
> -            continue;
> -          }
> -        }
> -
> -        retLocs.add(s);
> +            //set the base link data
> +        s.setItemName(s.getFULL_NAME_ND_RO().toLowerCase().trim());
> +        s.setItemID(s.getUFI());
> +        s.setItemType(s.getDSG());
> +        s.setItemParentID(s.getCC1().toLowerCase());
> +
> +        toponyms.add(s);
>         }
>   
>       } catch (SQLException ex) {
> @@ -138,16 +137,10 @@ public final class MySQLGeoNamesGazLinka
>         con.close();
>       }
>   
> -    return retLocs;
> +    return toponyms;
>     }
>   
> -  private Set<String> getCountryCodes(List<CountryContextHit> hits) {
> -    Set<String> ccs = new HashSet<String>();
> -    for (CountryContextHit hit : hits) {
> -      ccs.add(hit.getCountryCode().toLowerCase());
> -    }
> -    return ccs;
> -  }
> +
>   
>     public String format(String entity) {
>       return "\"" + entity + "\"";
>
> Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java?rev=1533959&r1=1533958&r2=1533959&view=diff
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java (original)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazEntry.java Sun Oct 20 20:04:41 2013
> @@ -18,7 +18,7 @@ package opennlp.tools.entitylinker;
>   import opennlp.tools.entitylinker.domain.BaseLink;
>   
>   /**
> - *
> + * Stores an entry from the USGS gazetteer
>   
>    */
>   public class MySQLUSGSGazEntry extends BaseLink
>
> Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java?rev=1533959&r1=1533958&r2=1533959&view=diff
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java (original)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/MySQLUSGSGazLinkable.java Sun Oct 20 20:04:41 2013
> @@ -23,6 +23,7 @@ import java.sql.SQLException;
>   import java.util.ArrayList;
>   import java.util.HashSet;
>   import java.util.List;
> +import java.util.Map;
>   import java.util.Set;
>   import java.util.logging.Level;
>   import java.util.logging.Logger;
> @@ -30,8 +31,7 @@ import opennlp.tools.entitylinker.domain
>   import opennlp.tools.util.Span;
>   
>   /**
> - *
> - * @author opennlp
> + * Links names to the USGS gazetteer
>    */
>   public class MySQLUSGSGazLinkable {
>   
> @@ -41,12 +41,12 @@ public class MySQLUSGSGazLinkable {
>     public MySQLUSGSGazLinkable() {
>     }
>   
> -  public ArrayList<BaseLink> find(String locationText, Span span, List<CountryContextHit> countryHits, EntityLinkerProperties properties) {
> +  public ArrayList<BaseLink> find(String locationText, Span span, Map<String, Set<Integer>> countryHits, EntityLinkerProperties properties) {
>       ArrayList<BaseLink> returnlocs = new ArrayList<BaseLink>();
>       try {
>         filterCountryContext = Boolean.valueOf(properties.getProperty("geoentitylinker.filter_by_country_context", "false"));
>         //the usgs gazateer only has us geonames, so only use it if the user doesn't care about country isolation or the hits contain us
> -      if (getCountryCodes(countryHits).contains("us") || !filterCountryContext) {
> +      if (countryHits.keySet().contains("us") || !filterCountryContext) {
>   
>           if (con == null) {
>             con = getMySqlConnection(properties);
> @@ -56,7 +56,7 @@ public class MySQLUSGSGazLinkable {
>           if (!thresh.matches("[azAZ]")) {
>             threshhold = Integer.valueOf(thresh);
>           }
> -        returnlocs.addAll(this.searchGaz(locationText, threshhold, getCountryCodes(countryHits), properties));
> +        returnlocs.addAll(this.searchGaz(locationText, threshhold, countryHits.keySet(), properties));
>         }
>       } catch (Exception ex) {
>         Logger.getLogger(MySQLUSGSGazLinkable.class.getName()).log(Level.SEVERE, null, ex);
> @@ -84,13 +84,13 @@ public class MySQLUSGSGazLinkable {
>       cs = con.prepareCall("CALL `search_gaz`(?, ?)");
>       cs.setString(1, this.format(searchString));
>       cs.setInt(2, matchthresh);
> -    ArrayList<MySQLUSGSGazEntry> retUrls = new ArrayList<MySQLUSGSGazEntry>();
> +    ArrayList<MySQLUSGSGazEntry> toponyms = new ArrayList<MySQLUSGSGazEntry>();
>       ResultSet rs;
>       try {
>         rs = cs.executeQuery();
>   
>         if (rs == null) {
> -        return retUrls;
> +        return toponyms;
>         }
>   
>         while (rs.next()) {
> @@ -99,21 +99,20 @@ public class MySQLUSGSGazLinkable {
>   
>           s.setFeatureid(String.valueOf(rs.getLong(2)));
>           s.setFeaturename(rs.getString(3));
> +
>           s.setFeatureclass(rs.getString(4));
>           s.setStatealpha(rs.getString(5));
>           s.setPrimarylatitudeDEC(rs.getDouble(6));
>           s.setPrimarylongitudeDEC(rs.getDouble(7));
>           s.setMapname(rs.getString(8));
> -        if (countryCodes.contains("us")) {
> -          s.setRank(s.getRank() + (s.getRank() * .5));
> -         // System.out.println(searchString +"USGS qualified on: " + s.getFeaturename());
> -        } else {
> -          s.setRank(s.getRank() * .5);
> -          if(filterCountryContext){
> -            continue;
> -          }
> -        }
> -        retUrls.add(s);
> +
> +        //set the base link data
> +        s.setItemName(s.getFeaturename().toLowerCase().trim());
> +        s.setItemID(s.getFeatureid());
> +        s.setItemType(s.getFeatureclass());
> +        s.setItemParentID("us");
> +
> +        toponyms.add(s);
>         }
>   
>       } catch (SQLException ex) {
> @@ -124,7 +123,7 @@ public class MySQLUSGSGazLinkable {
>         con.close();
>       }
>   
> -    return retUrls;
> +    return toponyms;
>     }
>   
>     private Set<String> getCountryCodes(List<CountryContextHit> hits) {
>
> Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java?rev=1533959&r1=1533958&r2=1533959&view=diff
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java (original)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/entitylinker/domain/BaseLink.java Sun Oct 20 20:04:41 2013
> @@ -13,29 +13,48 @@
>    * See the License for the specific language governing permissions and
>    * limitations under the License.
>    */
> -
>   package opennlp.tools.entitylinker.domain;
>   
>   /**
>    * Stores a minimal tuple of information. Intended to be used with LinkedSpan
>    *
> -
> + *
>    */
>   public abstract class BaseLink {
>   
> +  private String itemParentID;
>     private String itemID;
>     private String itemName;
>     private String itemType;
> +  private Double score;
> +  private Double fuzzyStringMatchingScore;
>   
>     public BaseLink() {
>     }
>   
> -  public BaseLink(String itemID, String itemName, String itemType) {
> +  public BaseLink(String itemParentID, String itemID, String itemName, String itemType) {
> +    this.itemParentID = itemParentID;
>       this.itemID = itemID;
>       this.itemName = itemName;
>       this.itemType = itemType;
>     }
>   
> +  public Double getScore() {
> +    return score;
> +  }
> +
> +  public void setScore(Double score) {
> +    this.score = score;
> +  }
> +
> +  public String getItemParentID() {
> +    return itemParentID;
> +  }
> +
> +  public void setItemParentID(String itemParentID) {
> +    this.itemParentID = itemParentID;
> +  }
> +
>     /**
>      * returns the itemid
>      *
> @@ -93,10 +112,16 @@ public abstract class BaseLink {
>       this.itemType = itemType;
>     }
>   
> -
> -
>     @Override
>     public String toString() {
>       return "BaseLink{" + "itemID=" + itemID + ", itemName=" + itemName + ", itemType=" + itemType + '}';
>     }
> +
> +  public Double getFuzzyStringMatchingScore() {
> +    return fuzzyStringMatchingScore;
> +  }
> +
> +  public void setFuzzyStringMatchingScore(Double fuzzyStringMatchingScore) {
> +    this.fuzzyStringMatchingScore = fuzzyStringMatchingScore;
> +  }
>   }
> \ No newline at end of file
>
> Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java
> URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java?rev=1533959&view=auto
> ==============================================================================
> --- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java (added)
> +++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramGenerator.java Sun Oct 20 20:04:41 2013
> @@ -0,0 +1,75 @@
> +/*
> + * Copyright 2013 The Apache Software Foundation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +package opennlp.tools.ngram;
> +
> +import java.util.ArrayList;
> +import java.util.List;
> +
> +/**
> + * Generates n-grams over a sliding window of tokens or chars, joined with an
> + * optional separator, and returns the grams as a list of strings
> + */
> +public class NGramGenerator {
> +
> +
> +  /**
> +   * Creates every ngram of n consecutive tokens, with the tokens of each gram
> +   * joined by the separator param value i.e. a,b,c,d with n = 3 and separator = "-"
> +   * would return a-b-c,b-c-d
> +   *
> +   * @param input     the input tokens the output ngrams will be derived from
> +   * @param n         the number of tokens as the sliding window; not validated, expected to be at least 1
> +   * @param separator each string in each gram will be separated by this value if desired. Pass in empty string if no separator is desired
> +   * @return the grams in input order; an empty list when input holds fewer than n tokens
> +   */
> +  public static List<String> generate(List<String> input, int n, String separator) {
> +
> +    List<String> outGrams = new ArrayList<String>();
> +    for (int i = 0; i < input.size() - (n - 2); i++) { //bound overshoots the last valid start; the check below skips windows that do not fit
> +      String gram = "";
> +      if ((i + n) <= input.size()) {
> +        for (int x = i; x < (n + i); x++) {
> +          gram += input.get(x) + separator;
> +        }
> +        gram = gram.substring(0, gram.lastIndexOf(separator)); //drop the separator appended after the last token
> +        outGrams.add(gram);
> +      }
> +    }
> +    return outGrams;
> +  }
> +/**
> + * Generates every ngram of n consecutive chars from a char[] input
> + * @param input the array of chars to convert to nGram
> + * @param n The number of grams (chars) that each output gram will consist of; not validated, expected to be at least 1
> + * @param separator each char in each gram will be separated by this value if desired. Pass in empty string if no separator is desired
> + * @return the grams in input order; an empty list when input holds fewer than n chars
> + */
> +  public static List<String> generate(char[] input, int n, String separator) {
> +
> +    List<String> outGrams = new ArrayList<String>();
> +    for (int i = 0; i < input.length - (n - 2); i++) { //bound overshoots the last valid start; the check below skips windows that do not fit
> +      String gram = "";
> +      if ((i + n) <= input.length) {
> +        for (int x = i; x < (n + i); x++) {
> +          gram += input[x] + separator;
> +        }
> +        gram = gram.substring(0, gram.lastIndexOf(separator)); //drop the separator appended after the last char
> +        outGrams.add(gram);
> +      }
> +    }
> +    return outGrams;
> +  }
> +}
>
>