You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2013/11/06 12:47:38 UTC
svn commit: r1539319 - in /opennlp/sandbox/apache-opennlp-addons: ./ src/
src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/
src/main/java/org/apache/opennlp/ src/main/java/org/apache/opennlp/addons/
src/main/java/org/apache/opennlp/...
Author: markg
Date: Wed Nov 6 11:47:37 2013
New Revision: 1539319
URL: http://svn.apache.org/r1539319
Log:
OPENNLP-614
Moved all GeoEntityLinker impl classes to sandbox. Called this module addons as a place to consolidate useful addons
to the base opennlp modules.
Added:
opennlp/sandbox/apache-opennlp-addons/pom.xml
opennlp/sandbox/apache-opennlp-addons/src/
opennlp/sandbox/apache-opennlp-addons/src/main/
opennlp/sandbox/apache-opennlp-addons/src/main/java/
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
opennlp/sandbox/apache-opennlp-addons/src/test/
opennlp/sandbox/apache-opennlp-addons/src/test/java/
opennlp/sandbox/apache-opennlp-addons/src/test/java/apache/
opennlp/sandbox/apache-opennlp-addons/src/test/java/apache/opennlp/
opennlp/sandbox/apache-opennlp-addons/src/test/java/apache/opennlp/addons/
opennlp/sandbox/apache-opennlp-addons/src/test/java/apache/opennlp/addons/AppTest.java
Modified:
opennlp/sandbox/apache-opennlp-addons/ (props changed)
Propchange: opennlp/sandbox/apache-opennlp-addons/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Nov 6 11:47:37 2013
@@ -0,0 +1 @@
+target
Added: opennlp/sandbox/apache-opennlp-addons/pom.xml
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/pom.xml?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/pom.xml (added)
+++ opennlp/sandbox/apache-opennlp-addons/pom.xml Wed Nov 6 11:47:37 2013
@@ -0,0 +1,61 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>apache-opennlp-addons</groupId>
+ <artifactId>apache-opennlp-addons</artifactId>
+ <version>1.0-SNAPSHOT</version>
+ <packaging>jar</packaging>
+<name>Apache OpenNLP Addons</name>
+
+ <url>http://maven.apache.org</url>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.3.2</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>3.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ <version>4.5.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <version>4.5.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queryparser</artifactId>
+ <version>4.5.0</version>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.6.0-SNAPSHOT</version>
+ <optional>true</optional>
+ </dependency>
+ </dependencies>
+</project>
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.sql.CallableStatement;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+
+/**
+ * Finds instances of country mentions in a String, typically a document text.
+ * Used to boost or degrade scoring of linked geo entities.
+ * <p>
+ * The country list is loaded lazily from a tab-delimited file on the first call
+ * to {@link #regexfind(String, EntityLinkerProperties)}. Every call replaces the
+ * results of the previous call, and none of the state is synchronized, so one
+ * instance must not be used by several threads at the same time.
+ */
+public class CountryContext {
+
+  private static final Logger LOG = Logger.getLogger(CountryContext.class.getName());
+
+  private Connection con;
+  private List<CountryContextEntry> countrydata;
+  private Map<String, Set<String>> nameCodesMap = new HashMap<String, Set<String>>();
+  private Map<String, Set<Integer>> countryMentions = new HashMap<String, Set<Integer>>();
+  private Set<CountryContextEntry> countryHits = new HashSet<>();
+
+  public CountryContext() {
+  }
+
+  /**
+   * @return the map of matched text (lower case) to the country codes it was
+   *         matched for, as filled by the last call to regexfind
+   */
+  public Map<String, Set<String>> getNameCodesMap() {
+    return nameCodesMap;
+  }
+
+  public void setNameCodesMap(Map<String, Set<String>> nameCodesMap) {
+    this.nameCodesMap = nameCodesMap;
+  }
+
+  /**
+   * Finds mentions of countries in a document, which is an essential element
+   * of the scoring that is done for geo linkedspans. The country list is read
+   * from the file named by the property
+   * {@code opennlp.geoentitylinker.countrycontext.filepath} the first time this
+   * method is called; each entry's name is used as a case-insensitive regular
+   * expression.
+   * <p>
+   * Failures are logged rather than thrown; whatever was found up to that
+   * point is returned.
+   *
+   * @param docText    the full text of the document
+   * @param properties EntityLinkerProperties for locating the country file
+   * @return a map of lower-cased country code to the character offsets at
+   *         which that country was mentioned; never null
+   */
+  public Map<String, Set<Integer>> regexfind(String docText, EntityLinkerProperties properties) {
+    // all three result holders describe one document only, so start fresh
+    countryMentions = new HashMap<String, Set<Integer>>();
+    countryHits = new HashSet<>();
+    nameCodesMap.clear();
+    try {
+      if (countrydata == null) {
+        countrydata = getCountryContextFromFile(properties);
+      }
+      for (CountryContextEntry entry : countrydata) {
+        Pattern regex;
+        try {
+          regex = Pattern.compile(entry.getFull_name_nd_ro(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+        } catch (java.util.regex.PatternSyntaxException ex) {
+          // one malformed name must not stop the scan for all other countries
+          LOG.log(Level.WARNING, "skipping country entry with an invalid pattern: " + entry.getFull_name_nd_ro(), ex);
+          continue;
+        }
+        Matcher rs = regex.matcher(docText);
+        String code = entry.getCc1().toLowerCase();
+
+        boolean found = false;
+        while (rs.find()) {
+          found = true;
+          Integer start = rs.start();
+          String hit = rs.group().toLowerCase();
+
+          Set<Integer> offsets = countryMentions.get(code);
+          if (offsets == null) {
+            offsets = new HashSet<Integer>();
+            countryMentions.put(code, offsets);
+          }
+          offsets.add(start);
+
+          if (!hit.equals("")) {
+            Set<String> codes = nameCodesMap.get(hit);
+            if (codes == null) {
+              codes = new HashSet<String>();
+              nameCodesMap.put(hit, codes);
+            }
+            codes.add(code);
+          }
+        }
+        if (found) {
+          countryHits.add(entry);
+        }
+      }
+    } catch (Exception ex) {
+      LOG.log(Level.SEVERE, null, ex);
+    }
+    return countryMentions;
+  }
+
+  /**
+   * Returns a unique set of country codes.
+   *
+   * @param hits the country hits discovered
+   * @return the lower-cased country codes of all hits
+   */
+  public static Set<String> getCountryCodes(List<CountryContextHit> hits) {
+    Set<String> ccs = new HashSet<String>();
+    for (CountryContextHit hit : hits) {
+      ccs.add(hit.getCountryCode().toLowerCase());
+    }
+    return ccs;
+  }
+
+  /**
+   * Joins country codes with commas.
+   *
+   * @param hits the codes to join
+   * @return the codes separated by "," in the set's iteration order, or an
+   *         empty string for an empty set
+   */
+  public static String getCountryCodeCSV(Set<String> hits) {
+    StringBuilder csv = new StringBuilder();
+    for (String code : hits) {
+      if (csv.length() > 0) {
+        csv.append(',');
+      }
+      csv.append(code);
+    }
+    return csv.toString();
+  }
+
+  /**
+   * Opens a JDBC connection using the db.driver, db.url, db.username and
+   * db.password properties.
+   */
+  private Connection getMySqlConnection(EntityLinkerProperties properties) throws Exception {
+
+    String driver = properties.getProperty("db.driver", "org.gjt.mm.mysql.Driver");
+    String url = properties.getProperty("db.url", "jdbc:mysql://localhost:3306/world");
+    String username = properties.getProperty("db.username", "root");
+    String password = properties.getProperty("db.password", "?");
+
+    Class.forName(driver);
+    Connection conn = DriverManager.getConnection(url, username, password);
+    return conn;
+  }
+
+  /**
+   * Reads the list from the database by calling a stored procedure
+   * getCountryList. Currently not called; regexfind loads the list from a
+   * file instead.
+   *
+   * @param properties supplies the connection settings
+   * @return the entries read; empty if the call failed (the failure is logged)
+   * @throws SQLException if closing the connection fails
+   */
+  private List<CountryContextEntry> getCountryData(EntityLinkerProperties properties) throws SQLException {
+    List<CountryContextEntry> entries = new ArrayList<CountryContextEntry>();
+    try {
+      if (con == null) {
+        con = getMySqlConnection(properties);
+      }
+      try (CallableStatement cs = con.prepareCall("CALL `getCountryList`()");
+           ResultSet rs = cs.executeQuery()) {
+        while (rs != null && rs.next()) {
+          CountryContextEntry s = new CountryContextEntry();
+          // column order: rc, cc1, full_name_nd_ro, dsg
+          s.setRc(rs.getString(1));
+          s.setCc1(rs.getString(2));
+          s.setFull_name_nd_ro(rs.getString(3));
+          s.setDsg(rs.getString(4));
+          entries.add(s);
+        }
+      }
+    } catch (Exception e) {
+      LOG.log(Level.SEVERE, "could not read the country list from the database", e);
+    } finally {
+      // con is still null when the connection attempt itself failed
+      if (con != null) {
+        try {
+          con.close();
+        } finally {
+          // never keep a closed connection around for the next call
+          con = null;
+        }
+      }
+    }
+    return entries;
+  }
+
+  /**
+   * @return country code to mention offsets, as found by the last regexfind
+   */
+  public Map<String, Set<Integer>> getCountryMentions() {
+    return countryMentions;
+  }
+
+  /**
+   * @return the country entries that matched during the last regexfind
+   */
+  public Set<CountryContextEntry> getCountryHits() {
+    return countryHits;
+  }
+
+  /**
+   * Loads the country list from the tab-delimited file named by the property
+   * {@code opennlp.geoentitylinker.countrycontext.filepath}. Every non-blank
+   * line must hold exactly four fields: rc, cc1, full_name_nd_ro, dsg. All
+   * values are lower-cased.
+   *
+   * @param properties supplies the file path
+   * @return the entries read before the end of the file or the first error;
+   *         errors are logged, not thrown
+   */
+  private List<CountryContextEntry> getCountryContextFromFile(EntityLinkerProperties properties) {
+    List<CountryContextEntry> entries = new ArrayList<>();
+    try {
+      String path = properties.getProperty("opennlp.geoentitylinker.countrycontext.filepath", "");
+      try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
+        String line;
+        // readLine() alone drives the loop, so no character of a line is lost
+        while ((line = reader.readLine()) != null) {
+          if (line.trim().isEmpty()) {
+            continue;
+          }
+          String[] values = line.split("\t");
+          if (values.length != 4) {
+            throw new IOException("improperly formatted country context file");
+          }
+          CountryContextEntry entry = new CountryContextEntry();
+          entry.setRc(values[0].toLowerCase());
+          entry.setCc1(values[1].toLowerCase());
+          entry.setFull_name_nd_ro(values[2].toLowerCase());
+          entry.setDsg(values[3].toLowerCase());
+          entries.add(entry);
+        }
+      }
+    } catch (IOException e) {
+      LOG.log(Level.SEVERE, "could not load the country context file", e);
+    }
+    return entries;
+  }
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextEntry.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.util.Objects;
+
+/**
+ * One record of the country list that is matched against document text to
+ * find country mentions. {@code cc1} is used as the country code and
+ * {@code full_name_nd_ro} as the name to search for; {@code rc} and
+ * {@code dsg} are carried along unchanged (presumably region code and
+ * designation, following the gazetteer column names - confirm with the data
+ * source).
+ */
+public class CountryContextEntry {
+
+  // field order everywhere: rc, cc1, full_name_nd_ro, dsg
+  private String rc;
+  private String cc1;
+  private String full_name_nd_ro;
+  private String dsg;
+
+  /** Creates an entry with all four values unset (null). */
+  public CountryContextEntry() {
+  }
+
+  /** Creates an entry from the four values, stored as given. */
+  public CountryContextEntry(String rc, String cc1, String full_name_nd_ro, String dsg) {
+    this.rc = rc;
+    this.cc1 = cc1;
+    this.full_name_nd_ro = full_name_nd_ro;
+    this.dsg = dsg;
+  }
+
+  public String getRc() {
+    return this.rc;
+  }
+
+  public void setRc(String rc) {
+    this.rc = rc;
+  }
+
+  public String getCc1() {
+    return this.cc1;
+  }
+
+  public void setCc1(String cc1) {
+    this.cc1 = cc1;
+  }
+
+  public String getFull_name_nd_ro() {
+    return this.full_name_nd_ro;
+  }
+
+  public void setFull_name_nd_ro(String full_name_nd_ro) {
+    this.full_name_nd_ro = full_name_nd_ro;
+  }
+
+  public String getDsg() {
+    return this.dsg;
+  }
+
+  public void setDsg(String dsg) {
+    this.dsg = dsg;
+  }
+
+  /** Combines the hashes of all four values; null values contribute 0. */
+  @Override
+  public int hashCode() {
+    int result = 7;
+    for (String value : new String[] {rc, cc1, full_name_nd_ro, dsg}) {
+      result = 17 * result + Objects.hashCode(value);
+    }
+    return result;
+  }
+
+  /**
+   * Two entries are equal when they are of the same class and all four
+   * values are equal (null equals null).
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == null || getClass() != obj.getClass()) {
+      return false;
+    }
+    final CountryContextEntry that = (CountryContextEntry) obj;
+    return Objects.equals(this.rc, that.rc)
+        && Objects.equals(this.cc1, that.cc1)
+        && Objects.equals(this.full_name_nd_ro, that.full_name_nd_ro)
+        && Objects.equals(this.dsg, that.dsg);
+  }
+
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContextHit.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+/**
+ * A single "hit" on a country: the country code together with the start and
+ * end positions of the hit.
+ */
+public class CountryContextHit {
+
+  private String countryCode;
+  private int start;
+  private int end;
+
+  /** Creates a hit with no country code (null) and both positions at 0. */
+  public CountryContextHit() {
+  }
+
+  /**
+   * @param countryCode the code of the country that was hit
+   * @param start       where the hit starts
+   * @param end         where the hit ends
+   */
+  public CountryContextHit(String countryCode, int start, int end) {
+    this.countryCode = countryCode;
+    this.start = start;
+    this.end = end;
+  }
+
+  /** @return the code of the country that was hit */
+  public String getCountryCode() {
+    return this.countryCode;
+  }
+
+  public void setCountryCode(String countryCode) {
+    this.countryCode = countryCode;
+  }
+
+  /** @return where the hit starts */
+  public int getStart() {
+    return this.start;
+  }
+
+  public void setStart(int start) {
+    this.start = start;
+  }
+
+  /** @return where the hit ends */
+  public int getEnd() {
+    return this.end;
+  }
+
+  public void setEnd(int end) {
+    this.end = end;
+  }
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on country context: how close the named entity's
+ * sentence is to mentions of the country each gazetteer candidate belongs to.
+ * The result is stored in every candidate's score map under the key
+ * "countrycontext".
+ */
+public class CountryProximityScorer implements LinkedEntityScorer<CountryContext> {
+
+  private Map<String, Set<String>> nameCodesMap;
+  String dominantCode = "";
+
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+
+    score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
+
+  }
+
+  /**
+   * Assigns a score to each BaseLink in each linkedSpan's set of N best
+   * matches. The score indicates how likely the toponym is correct based on
+   * the country context in the document.
+   *
+   * @param linkedData     the linked spans, holds the Namefinder results, and
+   *                       the list of BaseLink for each
+   * @param countryHits    all the country mentions in the document, as country
+   *                       code to character offsets
+   * @param nameCodesMap   maps a country indicator name to country codes. Used
+   *                       to determine if the namefinder found the same exact
+   *                       toponym the country context did. If so the score is
+   *                       boosted due to the high probability that the
+   *                       NameFinder actually "rediscovered" a country
+   * @param docText        the full text of the document...not used in this
+   *                       implementation
+   * @param sentences      the sentences that correspond to the doc text
+   * @param maxAllowedDist intended as the maximum distance at which a country
+   *                       mention still counts; currently not applied
+   * @return the same list that was passed in, with scores added to its links
+   */
+  public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
+    this.nameCodesMap = nameCodesMap;
+    setDominantCode(countryHits);
+    for (LinkedSpan<BaseLink> linkedspan : linkedData) {
+      simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
+    }
+    return linkedData;
+  }
+
+  /**
+   * Sets the class level variable to the code with the most mentions, or to
+   * the empty string when there are no mentions at all.
+   *
+   * @param countryHits country code to mention offsets
+   */
+  private void setDominantCode(Map<String, Set<Integer>> countryHits) {
+    // forget the dominant code of a previously scored document
+    dominantCode = "";
+    int hits = -1;
+    for (Map.Entry<String, Set<Integer>> mention : countryHits.entrySet()) {
+      int count = mention.getValue().size();
+      if (count > hits) {
+        hits = count;
+        dominantCode = mention.getKey();
+      }
+    }
+  }
+
+  /**
+   * Generates distances from each country mention to the start of the span's
+   * sentence in the doc text, and derives a score per link from them.
+   * Ultimately an attempt to ensure that ambiguously named toponyms are
+   * resolved to the correct country and coordinate.
+   *
+   * @param sentences          the sentence spans of the document
+   * @param countryHits        country code to mention offsets
+   * @param span               the linked span whose links are scored
+   * @param maxAllowedDistance currently not applied
+   * @return the span that was passed in
+   */
+  private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
+    // the position of the span is approximated by the start of its sentence;
+    // could be far off for large sentences caused by poor sentence detection
+    int sentenceIdx = span.getSentenceid();
+    int sentIndexInDoc = sentences[sentenceIdx].getStart();
+
+    // Map<countrycode, distances of this span from every mention of that code>
+    Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
+    for (Map.Entry<String, Set<Integer>> mention : countryHits.entrySet()) {
+      Set<Integer> distances = new HashSet<Integer>();
+      for (Integer cHit : mention.getValue()) {
+        //TODO only include mentions nearer than maxAllowedDistance
+        distances.add(Math.abs(sentIndexInDoc - cHit));
+      }
+      if (!distances.isEmpty()) {
+        distancesFromCodeMap.put(mention.getKey(), distances);
+      }
+    }
+
+    // Map<countrycode, likelihood that this span refers to a toponym of that country>
+    Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
+    for (BaseLink link : span.getLinkedEntries()) {
+      // start from zero for every link, so a link without any country context
+      // does not pick up the score of the link before it
+      Double score = 0.0;
+      //getItemParentId is the country code
+      String spanCountryCode = link.getItemParentID();
+      if (scoreMap.containsKey(spanCountryCode)) {
+        score = scoreMap.get(spanCountryCode);
+        // does the name extracted match a country name, with the correct code?
+        Set<String> codesForName = nameCodesMap.get(link.getItemName().toLowerCase());
+        if (codesForName != null && codesForName.contains(spanCountryCode)) {
+          //boost the score because it is likely that this is the location in the text: add .75, capped at 1
+          //TODO: make this boost configurable
+          //TODO: improve this with a geographic/geometry based clustering (linear binning to be more precise) of points returned from the gaz
+          score = Math.min(1d, score + .75);
+          //boost the score again if the hit is from the dominant country context
+          if (spanCountryCode.equals(dominantCode)) {
+            score = Math.min(1d, score + .25);
+          }
+        }
+      }
+      link.getScoreMap().put("countrycontext", score);
+    }
+    return span;
+  }
+
+  /**
+   * Takes a map of distances from the NE to each country mention and generates
+   * a map of scores for each country code. The caller looks the score up by
+   * the parent id (country code) of each BaseLink.
+   *
+   * @param distanceMap country code to distances
+   * @param sentences   not used
+   * @param span        not used
+   * @return country code to score; empty if distanceMap is empty
+   */
+  private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
+
+    Map<String, Double> scoreMap = new HashMap<String, Double>();
+    if (distanceMap.isEmpty()) {
+      return scoreMap;
+    }
+    //get min max over all codes for normalization, this could be more efficient
+    TreeSet<Integer> all = new TreeSet<Integer>();
+    for (Set<Integer> distances : distanceMap.values()) {
+      all.addAll(distances);
+    }
+    Integer min = all.first();
+    Integer max = all.last();
+    for (Map.Entry<String, Set<Integer>> entry : distanceMap.entrySet()) {
+      TreeSet<Double> normalizedDistances = new TreeSet<Double>();
+      for (Integer distance : entry.getValue()) {
+        Double norm = normalize(distance, min, max);
+        //reverse the normed distance so that closer mentions score higher
+        //this could be improved with a "decaying" function using an increasing negative exponent
+        normalizedDistances.add(Math.abs(norm - 1));
+      }
+      scoreMap.put(entry.getKey(), slidingDistanceAverage(new ArrayList<Double>(normalizedDistances)));
+    }
+    return scoreMap;
+  }
+
+  /**
+   * This method is an attempt to make closer clusters of mentions group
+   * together to smooth out the average, so one distant outlier does not kill
+   * the score for an obviously good hit. More elegant solution is possible
+   * using Math.pow, and making the score decay with distance by using an
+   * increasing negative exponent.
+   *
+   * @param normDis the normalized and sorted set of distances as a list
+   * @return the average of the averages of neighbouring values; the plain
+   *         average for fewer than three values; 0 for an empty list
+   */
+  private Double slidingDistanceAverage(List<Double> normDis) {
+    if (normDis.isEmpty()) {
+      return 0d;
+    }
+    List<Double> windowOfAverages = new ArrayList<Double>();
+    if (normDis.size() < 3) {
+      windowOfAverages.addAll(normDis);
+    } else {
+      for (int i = 0; i < normDis.size() - 1; i++) {
+        double a = normDis.get(i);
+        double b = normDis.get(i + 1);
+        windowOfAverages.add((a + b) / 2);
+      }
+    }
+    double sum = 0d;
+    for (double d : windowOfAverages) {
+      sum += d;
+    }
+    //TODO: ++ prob when large amounts of mentions for a code
+    return sum / windowOfAverages.size();
+  }
+
+  /**
+   * Transposes a value from the range [minimum, maximum] to the range [0, 1].
+   * Used to normalize distances in this class.
+   *
+   * @param valueToNormalize the value to place within the new range
+   * @param minimum          the min of the set to be transposed
+   * @param maximum          the max of the set to be transposed
+   * @return the normalized value; 0 when minimum equals maximum
+   */
+  private Double normalize(int valueToNormalize, int minimum, int maximum) {
+    if (maximum == minimum) {
+      // all distances are equal (e.g. a single mention): dividing would
+      // yield NaN, so treat the value as the closest possible one
+      return 0d;
+    }
+    return (double) (valueToNormalize - minimum) / (maximum - minimum);
+  }
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.ngram.NGramGenerator;
+import opennlp.tools.util.Span;
+
+/**
+ *
+ * Generates scores for string comparisons.
+ */
+public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+ for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
+ for (BaseLink link : linkedSpan.getLinkedEntries()) {
+ Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);
+ link.getScoreMap().put("dice", dice);
+ Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""));
+ link.getScoreMap().put("levenshtein", ld);
+ }
+ }
+
+
+ }
+
+ /**
+ * Generates a score based on an overlap of nGrams between two strings using
+ * the DiceCoefficient technique.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @param nGrams number of chars in each gram
+ * @return
+ */
+ public double getDiceCoefficient(String s1, String s2, int nGrams) {
+ if (s1.equals("") || s1.equals("")) {
+ return 0d;
+ }
+ List<String> s1Grams = NGramGenerator.generate(s1.toCharArray(), nGrams, "");
+ List<String> s2Grams = NGramGenerator.generate(s2.toCharArray(), nGrams, "");
+
+ Set<String> overlap = new HashSet<String>(s1Grams);
+ overlap.retainAll(s2Grams);
+ double totcombigrams = overlap.size();
+
+ return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
+ }
+
+ private int minimum(int a, int b, int c) {
+ return Math.min(Math.min(a, b), c);
+ }
+
+ public int getLevenshteinDistance(CharSequence str1,
+ CharSequence str2) {
+ int[][] distance = new int[str1.length() + 1][str2.length() + 1];
+
+ for (int i = 0; i <= str1.length(); i++) {
+ distance[i][0] = i;
+ }
+ for (int j = 1; j <= str2.length(); j++) {
+ distance[0][j] = j;
+ }
+
+ for (int i = 1; i <= str1.length(); i++) {
+ for (int j = 1; j <= str2.length(); j++) {
+ distance[i][j] = minimum(
+ distance[i - 1][j] + 1,
+ distance[i][j - 1] + 1,
+ distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
+ }
+ }
+
+ return distance[str1.length()][str2.length()];
+ }
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerEntry.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.util.HashMap;
+import java.util.Map;
+import opennlp.tools.entitylinker.domain.BaseLink;
+
+/**
+ *
+ * Stores a record from a geographic placenames gazateer
+ */
+public class GazateerEntry extends BaseLink {
+
+ private Double latitude;
+ private Double longitude;
+ private String source;
+ private String indexID;
+ private Map<String, String> indexData=new HashMap<>();
+
+ public String getIndexID() {
+ return indexID;
+ }
+
+ public void setIndexID(String indexID) {
+ this.indexID = indexID;
+ }
+
+ public Double getLatitude() {
+ return latitude;
+ }
+
+ public void setLatitude(Double latitude) {
+ this.latitude = latitude;
+ }
+
+ public Double getLongitude() {
+ return longitude;
+ }
+
+ public void setLongitude(Double longitude) {
+ this.longitude = longitude;
+ }
+
+ public String getSource() {
+ return source;
+ }
+
+ public void setSource(String source) {
+ this.source = source;
+ }
+
+ public Map<String, String> getIndexData() {
+ return indexData;
+ }
+
+ public void setIndexData(Map<String, String> indexData) {
+ this.indexData = indexData;
+ }
+
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,96 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.Version;
+
+/**
+ *
+ * @author Owner
+ */
+public class GazateerIndexer {
+
+ public enum GazType {
+
+ GEONAMES {
+ @Override
+ public String toString() {
+ return "/opennlp_geoentitylinker_usgsgaz_idx";
+ }
+ },
+ USGS {
+ @Override
+ public String toString() {
+ return "/opennlp_geoentitylinker_usgsgaz_idx";
+ }
+ }
+ }
+
+ public void index(File outputIndexDir, File gazateerInputData, GazType type) throws Exception {
+ if (!outputIndexDir.isDirectory()) {
+ throw new IllegalArgumentException("outputIndexDir must be a directory.");
+ }
+
+ String indexloc = outputIndexDir + type.toString();
+ Directory index = new MMapDirectory(new File(indexloc));
+
+ Analyzer a = new StandardAnalyzer(Version.LUCENE_45);
+ IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, a);
+
+ IndexWriter w = new IndexWriter(index, config);
+
+ readFile(gazateerInputData, w);
+ w.commit();
+ w.close();
+
+ }
+
+ public void readFile(File gazateerInputData, IndexWriter w) throws Exception {
+ BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+ List<String> fields = new ArrayList<String>();
+ int counter = 0;
+ System.out.println("reading gazateer data from file...........");
+ while (reader.read() != -1) {
+ String line = reader.readLine();
+ String[] values = line.split("\\|");//nga format
+ if (counter == 0) {
+ // build fields
+ for (String columnName : values) {
+ fields.add(columnName.replace("»¿", ""));
+ }
+
+
+ } else {
+ Document doc = new Document();
+ for (int i = 0; i < fields.size() - 1; i++) {
+ doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
+ }
+ w.addDocument(doc);
+ }
+ counter++;
+ if (counter % 10000 == 0) {
+ w.commit();
+ System.out.println(counter + " .........committed to index..............");
+ }
+
+ }
+
+ }
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.queryparser.classic.ParseException;
+
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.util.Version;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+/**
+ *
+ * Searches Gazateers stored in a MMapDirectory lucene index
+ */
+public class GazateerSearcher {
+
+ private FuzzyStringMatchScorer diceScorer = new FuzzyStringMatchScorer();
+ private double scoreCutoff = .75;
+ private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
+ private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
+ private IndexSearcher geonamesSearcher;// = new IndexSearcher(geonamesReader);
+ private Analyzer geonamesAnalyzer;
+ //usgs US gazateer
+ private Directory usgsIndex;//= new MMapDirectory(new File(indexloc));
+ private IndexReader usgsReader;// = DirectoryReader.open(geonamesIndex);
+ private IndexSearcher usgsSearcher;// = new IndexSearcher(geonamesReader);
+ private Analyzer usgsAnalyzer;
+
+ public GazateerSearcher() {
+ }
+
+ public ArrayList<GazateerEntry> geonamesFind(String searchString, int rowsReturned, String code, EntityLinkerProperties properties) {
+ ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ try {
+
+
+ if (geonamesIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.geonames", "");
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+ scoreCutoff = Double.valueOf(cutoff);
+ geonamesIndex = new MMapDirectory(new File(indexloc));
+ geonamesReader = DirectoryReader.open(geonamesIndex);
+ geonamesSearcher = new IndexSearcher(geonamesReader);
+ geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ }
+
+ String luceneQueryString = "FULL_NAME_ND_RO:" + searchString + " & CC1:" + code.toUpperCase();// + "~1.0";
+ QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
+ Query q = parser.parse(luceneQueryString);
+
+
+ TopDocs search = geonamesSearcher.search(q, rowsReturned);
+ double maxScore = (double) search.getMaxScore();
+
+ for (int i = 0; i < search.scoreDocs.length; ++i) {
+ GazateerEntry entry = new GazateerEntry();
+ int docId = search.scoreDocs[i].doc;
+ double sc = search.scoreDocs[i].score;
+
+ entry.getScoreMap().put("lucene", sc);
+
+ entry.getScoreMap().put("rawlucene", sc);
+ entry.setIndexID(docId + "");
+ entry.setSource("geonames");
+
+ Document d = geonamesSearcher.doc(docId);
+ List<IndexableField> fields = d.getFields();
+ for (int idx = 0; idx < fields.size(); idx++) {
+ String value = d.get(fields.get(idx).name());
+ value = value.toLowerCase();
+ switch (idx) {
+ case 1:
+ entry.setItemID(value);
+ break;
+ case 3:
+ entry.setLatitude(Double.valueOf(value));
+ break;
+ case 4:
+ entry.setLongitude(Double.valueOf(value));
+ break;
+ case 10:
+ entry.setItemType(value);
+ break;
+ case 12:
+ entry.setItemParentID(value);
+ break;
+ case 23:
+ entry.setItemName(value);
+ break;
+ }
+ entry.getIndexData().put(fields.get(idx).name(), value);
+ }
+ //only keep it if the country code is a match
+ if (entry.getItemParentID().toLowerCase().equals(code.toLowerCase())) {
+ linkedData.add(entry);
+ }
+ }
+
+ normalize(linkedData, 0d, maxScore);
+ prune(linkedData);
+ } catch (IOException | ParseException ex) {
+ System.err.println(ex);
+ }
+ return linkedData;
+ }
+
+ public ArrayList<GazateerEntry> usgsFind(String searchString, int rowsReturned, EntityLinkerProperties properties) {
+ ArrayList<GazateerEntry> linkedData = new ArrayList<>();
+ try {
+
+
+ if (usgsIndex == null) {
+ String indexloc = properties.getProperty("opennlp.geoentitylinker.gaz.usgs", "");
+ String cutoff = properties.getProperty("opennlp.geoentitylinker.gaz.lucenescore.min", ".75");
+ scoreCutoff = Double.valueOf(cutoff);
+ usgsIndex = new MMapDirectory(new File(indexloc));
+ usgsReader = DirectoryReader.open(usgsIndex);
+ usgsSearcher = new IndexSearcher(usgsReader);
+ usgsAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
+ }
+
+ String luceneQueryString = "FEATURE_NAME:" + searchString + " OR MAP_NAME: " + searchString;
+ QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, usgsAnalyzer);
+ Query q = parser.parse(luceneQueryString);
+
+
+ TopDocs search = usgsSearcher.search(q, rowsReturned);
+ double maxScore = (double) search.getMaxScore();
+
+
+ for (int i = 0; i < search.scoreDocs.length; ++i) {
+ GazateerEntry entry = new GazateerEntry();
+ int docId = search.scoreDocs[i].doc;
+ double sc = search.scoreDocs[i].score;
+ //keep track of the min score for normalization
+
+ entry.getScoreMap().put("lucene", sc);
+ entry.getScoreMap().put("rawlucene", sc);
+ entry.setIndexID(docId + "");
+ entry.setSource("usgs");
+ entry.setItemParentID("us");
+
+
+ Document d = usgsSearcher.doc(docId);
+ List<IndexableField> fields = d.getFields();
+ for (int idx = 0; idx < fields.size(); idx++) {
+ String value = d.get(fields.get(idx).name());
+ value = value.toLowerCase();
+ switch (idx) {
+ case 0:
+ entry.setItemID(value);
+ break;
+ case 1:
+ entry.setItemName(value);
+ break;
+ case 2:
+ entry.setItemType(value);
+ break;
+ case 9:
+ entry.setLatitude(Double.valueOf(value));
+ break;
+ case 10:
+ entry.setLongitude(Double.valueOf(value));
+ break;
+ }
+ entry.getIndexData().put(fields.get(idx).name(), value);
+ }
+ linkedData.add(entry);
+
+
+ }
+
+ normalize(linkedData, 0d, maxScore);
+ prune(linkedData);
+ } catch (IOException | ParseException ex) {
+ System.err.println(ex);
+ }
+
+ return linkedData;
+ }
+
+ private void normalize(ArrayList<GazateerEntry> linkedData, Double minScore, Double maxScore) {
+ for (GazateerEntry gazateerEntry : linkedData) {
+
+ double luceneScore = gazateerEntry.getScoreMap().get("lucene");
+ luceneScore = normalize(luceneScore, minScore, maxScore);
+ luceneScore = luceneScore > 1.0 ? 1.0 : luceneScore;
+ luceneScore = (luceneScore == Double.NaN) ? 0.001 : luceneScore;
+ gazateerEntry.getScoreMap().put("lucene", luceneScore);
+ }
+ }
+
+ private void prune(ArrayList<GazateerEntry> linkedData) {
+ for (Iterator<GazateerEntry> itr = linkedData.iterator(); itr.hasNext();) {
+ GazateerEntry ge = itr.next();
+ if (ge.getScoreMap().get("lucene") < scoreCutoff) {
+ itr.remove();
+ }
+ }
+ }
+
+ private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,129 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.EntityLinker;
+/**
+ * Links location entities to gazatteers. Currently supports gazateers in a
+ * MySql database (NGA and USGS)
+ *
+ *
+ */
+public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
+
+ // CountryProximityScorer scorer = new CountryProximityScorer();
+// private MySQLGeoNamesGazLinkable geoNamesGaz;// = new MySQLGeoNamesGazLinkable();
+// private MySQLUSGSGazLinkable usgsGaz;//= new MySQLUSGSGazLinkable();
+ private CountryContext countryContext;
+ private Map<String, Set<Integer>> countryMentions;
+ private EntityLinkerProperties linkerProperties;
+ private GazateerSearcher gazateerSearcher = new GazateerSearcher();
+ /**
+ * Flag for deciding whether to search gaz only for toponyms within countries
+ * that are mentioned in the document
+ */
+ private Boolean filterCountryContext = true;
+
+ public GeoEntityLinker() {
+ countryContext = new CountryContext();
+ }
+
+ @Override
+ public List<LinkedSpan> find(String doctext, Span[] sentences, String[][] tokensBySentence, Span[][] namesBySentence) {
+ ArrayList<LinkedSpan> spans = new ArrayList<LinkedSpan>();
+
+ if (linkerProperties == null) {
+ throw new IllegalArgumentException("EntityLinkerProperties cannot be null");
+ }
+ countryMentions = countryContext.regexfind(doctext, linkerProperties);
+
+ for (int s = 0; s < sentences.length; s++) {
+ Span[] names = namesBySentence[s];
+ String[] tokens = tokensBySentence[s];
+ String[] matches = Span.spansToStrings(names, tokens);
+
+ for (int i = 0; i < matches.length; i++) {
+
+//nga gazateer is for other than US placenames, don't use it unless US is a mention in the document
+ ArrayList<BaseLink> geoNamesEntries = new ArrayList<BaseLink>();
+ if (!(countryMentions.keySet().contains("us") && countryMentions.keySet().size() == 1) || countryMentions.keySet().size() > 1 || countryMentions.keySet().isEmpty()) {
+ // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
+ for (String code : countryMentions.keySet()) {
+ if (!code.equals("us")) {
+ geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code, linkerProperties));
+ }
+ }
+
+ }
+ ArrayList<BaseLink> usgsEntries = new ArrayList<BaseLink>();
+ if (countryMentions.keySet().contains("us") || countryMentions.keySet().isEmpty()) {
+ //usgsEntries = usgsGaz.find(matches[i], names[i], linkerProperties);
+ usgsEntries.addAll(gazateerSearcher.usgsFind(matches[i], 3, linkerProperties));
+ }
+ LinkedSpan<BaseLink> geoSpan = new LinkedSpan<BaseLink>(geoNamesEntries, names[i].getStart(), names[i].getEnd());
+
+ if (!usgsEntries.isEmpty()) {
+ geoSpan.getLinkedEntries().addAll(usgsEntries);
+ geoSpan.setSearchTerm(matches[i]);
+ }
+
+ if (!geoSpan.getLinkedEntries().isEmpty()) {
+ geoSpan.setSearchTerm(matches[i]);
+ geoSpan.setSentenceid(s);
+ spans.add(geoSpan);
+ }
+ }
+ }
+
+ List<LinkedEntityScorer<CountryContext>> scorers = new ArrayList<>();
+ scorers.add(new FuzzyStringMatchScorer());
+ scorers.add(new GeoHashBinningScorer());
+ scorers.add(new CountryProximityScorer());
+
+ for (LinkedEntityScorer scorer : scorers) {
+ scorer.score(spans, doctext, sentences, countryContext);
+ }
+ return spans;
+ }
+
+ @Override
+ public void setEntityLinkerProperties(EntityLinkerProperties properties) {
+ this.linkerProperties = properties;
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, Span[] tokens, Span[] nameSpans, int sentenceIndex) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
+ }
+
+ @Override
+ public List<LinkedSpan> find(String text, Span[] sentences, String[] tokens, Span[] nameSpans) {
+ throw new UnsupportedOperationException("The GeoEntityLinker requires the entire document for proper scoring. This method is unsupported"); //To change body of generated methods, choose Tools | Templates.
+ }
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,275 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ *Scores toponymns based on geographic point binning (clustering). This classes output is highly dependant on the quality
+ * of points returned from the gazateer. False positive hits from the index will pollute this result. Ensure the score cutoff for the
+ * Lucene search is set to an appropriate level so this class if not fed poor data.
+ */
+public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+ score( linkedSpans);
+ }
+
+ private void score(List<LinkedSpan> geospans) {
+ Map<Double, Double> latLongs = new HashMap<Double, Double>();
+
+ /**
+ * collect all the lat longs
+ */
+ for (LinkedSpan<BaseLink> ls : geospans) {
+ for (BaseLink bl : ls.getLinkedEntries()) {
+ if (bl instanceof GazateerEntry) {
+ GazateerEntry entry = (GazateerEntry) bl;
+ latLongs.put(entry.getLatitude(), entry.getLongitude());
+
+ }
+ }
+ }
+
+ /**
+ * convert to geohash and add to sortedset
+ */
+ TreeSet<Long> geoHashes = new TreeSet<Long>();
+ for (Map.Entry<Double, Double> entry : latLongs.entrySet()) {
+ geoHashes.add(geoHash(entry.getKey(), entry.getValue()));
+ }
+ /**
+ * bin the points and generate a scoremap
+ */
+ Map<Long, Set<Long>> bins = bin(geoHashes);
+ Map<Long, Double> scores = getScore((TreeMap<Long, Set<Long>>) bins);
+ /**
+ * iterate over the data again and assign the score based on the bins
+ */
+ for (LinkedSpan<BaseLink> ls : geospans) {
+ for (BaseLink bl : ls.getLinkedEntries()) {
+ Long geohash = -1L;
+ Double score = 0d;
+ if (bl instanceof GazateerEntry) {
+ GazateerEntry entry = (GazateerEntry) bl;
+ geohash = geoHash(entry.getLatitude(), entry.getLongitude());
+
+ }
+ if (scores.containsKey(geohash)) {
+ score = scores.get(geohash);
+
+ } else {
+ for (Long bin : bins.keySet()) {
+ if (bin == geohash || bins.get(bin).contains(geohash)) {
+ score = scores.get(bin);
+ break;
+ }
+ }
+ }
+ bl.getScoreMap().put("geohashbin", score);
+ }
+ }
+
+
+ }
+
+ private Long normalize(Double coordpart, Boolean isLat) {
+ Integer add = isLat ? 90 : 180;
+ coordpart = Math.abs(coordpart + add);
+ coordpart = coordpart * 1000000;
+
+ Long l = Math.round(coordpart);
+ String coord = String.valueOf(l);
+ if (coord.length() < 8) {
+ while (coord.length() < 8) {
+ coord += "0";
+ }
+ }
+ coord = coord.substring(0, 8);
+ l = Long.valueOf(coord);
+ return l;
+ }
+
+ /**
+ * interleaves a lat and a long to place the coordinate in linear sortable
+ * space for binning simplicity
+ *
+ * @param lat
+ * @param lon
+ * @return
+ */
+ private Long geoHash(double lat, double lon) {
+ Long normLat = normalize(lat, Boolean.TRUE);
+ Long normLon = normalize(lon, Boolean.FALSE);
+ String sLat = String.valueOf(normLat);
+ String sLon = String.valueOf(normLon);
+ char[] latInts = sLat.toCharArray();
+ char[] lonInts = sLon.toCharArray();
+ String geoHash = "";
+ int len = latInts.length > lonInts.length ? lonInts.length : latInts.length;
+ for (int i = 0; i < len - 1; i++) {
+ String a = String.valueOf(latInts[i]);
+ String b = String.valueOf(lonInts[i]);
+ geoHash += a + b;
+ }
+
+ return Long.valueOf(geoHash);
+ }
+
+ private Map<Long, Set<Long>> bin(TreeSet<Long> sets) {
+ ArrayList<Long> list = new ArrayList<Long>(sets);
+ ArrayList<Long> diffs = new ArrayList<Long>();
+ /**
+ * create a set of differences between the points
+ */
+ for (int i = 0; i < list.size() - 1; i++) {
+ Long n = list.get(i + 1);
+ Long v = list.get(i);
+ diffs.add(Math.abs(n - v));
+ }
+ /**
+ * generate an average "distance" between the normed points
+ */
+ Long sum = 0L;
+ for (Long l : diffs) {
+ sum += l;
+ }
+ Long avg=sum;
+ if(!diffs.isEmpty()){
+ avg = sum / diffs.size();
+ }
+
+
+ /**
+ * generate break values where the disparity is greater than the average
+ */
+ TreeSet<Long> breaks = new TreeSet<Long>();
+ for (int i = 0; i < list.size() - 1; i++) {
+ Long n = list.get(i + 1);
+ Long v = list.get(i);
+ //Long percent = 100 - (v / n * 100);
+ Long diff = n - v;
+ if (diff > avg) {
+ breaks.add(v);
+ }
+ }
+ /**
+ * based on the break values, place subsets of close points into bins
+ */
+ TreeMap<Long, Set<Long>> binToAmount = new TreeMap<Long, Set<Long>>();
+ Long lastBreak = -1L;
+ for (Long br : breaks) {
+ if (lastBreak == -1L) {
+ binToAmount.put(br, sets.subSet(0L, true, br, true));
+ } else {
+ binToAmount.put(br, sets.subSet(lastBreak, false, br, true));
+ }
+ lastBreak = br;
+ }
+ lastBreak = sets.higher(lastBreak);
+ if (lastBreak != null) {
+ binToAmount.put(lastBreak, sets.subSet(lastBreak, true, sets.last(), true));
+ if (binToAmount.get(lastBreak).isEmpty()) {
+ binToAmount.get(lastBreak).add(lastBreak);
+ }
+ }
+ /**
+ * "binToAmount" is a map of the break value to all the points behind it
+ * (it's sorted), so the key is the max value of its set of values
+ */
+ return binToAmount;
+ }
+
+ /**
+ * returns a map of geohashes and their score
+ *
+ * @param binToAmount
+ * @return Map< Geohash, score>
+ */
+ private Map<Long, Double> getScore(TreeMap<Long, Set<Long>> binToAmount) {
+ TreeMap<Long, Double> ranks = new TreeMap<Long, Double>();
+ TreeMap<Long, Double> normRanks = new TreeMap<Long, Double>();
+ /**
+ * if there is only one bin return 1 as the rank for each item in the value
+ */
+ if (binToAmount.keySet().size() == 1 || binToAmount.keySet().isEmpty()) {
+ for (Long bin : binToAmount.keySet()) {
+ for (Long hash : binToAmount.get(bin)) {
+ ranks.put(bin, 1d);
+ }
+ }
+ return ranks;
+ }
+ int total = 0;
+ /**
+ * generate a total number of points
+ */
+ for (Set<Long> geohashes : binToAmount.values()) {
+ total += geohashes.size();
+ }
+ /**
+ * divide total by bin size, largest bin size gets best score, everything in
+ * that bin gets that score because it is part of that primary cluster
+ * TODO... do an extra iteration of clustering within the predominant
+ * cluster to refine the scoring or make the basis of the binning more
+ * granular than > avg
+ */
+ TreeSet<Double> rankSet = new TreeSet<Double>();
+ for (Long key : binToAmount.keySet()) {
+ int size = binToAmount.get(key).size();
+ Double rank = (double) total / size;
+ rankSet.add(rank);
+ ranks.put(key, rank);
+ }
+ /**
+ * load the final score map with normalized values
+ */
+ for (Map.Entry<Long, Double> rank : ranks.entrySet()) {
+ double norm = normalize(rank.getValue(), rankSet.first() + .1, rankSet.last() + .1);
+ double reverse = Math.abs(norm - 1);
+ double score = reverse > 1d ? 1.0 : reverse;
+ normRanks.put(rank.getKey(), score);
+ }
+
+ return normRanks;
+ }
+
+ /**
+ * transposes a number in a range to a double between 0 and 1
+ *
+ * @param valueToNormalize the value to be normalized (placed within a new
+ * range of 0-1)
+ * @param minimum the min of the current range
+ * @param maximum the max of the current range
+ * @return
+ */
+ private Double normalize(Double valueToNormalize, Double minimum, Double maximum) {
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
+
Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.util.List;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Contract for scoring linked entities. The scoreMap mentioned on the score method
+ * logically represents a pair: "score type" to the "actual score".
+ * @param <T> the type of the additional context handed to the score method
+ */
+public interface LinkedEntityScorer<T> {
+
+/**
+ * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan.
+ * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
+ * @param docText the full text of the document
+ * @param sentenceSpans the sentence spans that correspond to the document text
+ * @param additionalContext any additional data required to perform the scoring operation
+ */
+ void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, T additionalContext);
+}
Added: opennlp/sandbox/apache-opennlp-addons/src/test/java/apache/opennlp/addons/AppTest.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/test/java/apache/opennlp/addons/AppTest.java?rev=1539319&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/test/java/apache/opennlp/addons/AppTest.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/test/java/apache/opennlp/addons/AppTest.java Wed Nov 6 11:47:37 2013
@@ -0,0 +1,38 @@
+package apache.opennlp.addons;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * Placeholder unit test for the addons module.
+ * <p>
+ * Its single test asserts a constant and therefore always passes.
+ */
+public class AppTest extends TestCase {
+
+  /**
+   * Creates the test case.
+   *
+   * @param testName name of the test case
+   */
+  public AppTest(String testName) {
+    super(testName);
+  }
+
+  /**
+   * Builds the test suite for this class.
+   * @return the suite of tests being tested
+   */
+  public static Test suite() {
+    TestSuite suite = new TestSuite(AppTest.class);
+    return suite;
+  }
+
+  /**
+   * Trivial check that always succeeds.
+   */
+  public void testApp() {
+    assertTrue(true);
+  }
+}