You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/04/24 13:20:19 UTC
[2/4] opennlp-addons git commit: Fix checkstyle errors in
geoentitylinker
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
index f457822..027efc2 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/RegionProcessor.java
@@ -1,113 +1,115 @@
-/*
- * Copyright 2014 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.indexing;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.util.ArrayList;
-import java.util.List;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.IndexWriter;
-
-public class RegionProcessor {
-
- public static void main(String[] args) {
- RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null);
- }
-
- /**
- *
- * @param regionsFile the file that stores Region references. the format of
- * this file is tab delimitted text with index 0 as the name of the region,
- * index 1 as the longitude, and index 2 as the latitude
- * @param outputCountryContextfile this is the country context files shared by
- * all indexing processors
- * @param w
- */
- public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
- try {
- readFile(regionsFile, outputCountryContextfile, w);
- } catch (Exception ex) {
- ex.printStackTrace();
- }
- }
-
- public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception {
- List<String> ccfileentries = new ArrayList<>();
- BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<>();
- int counter = 0;
- System.out.println("reading gazetteer data from Regions file...........");
- String line = "";
- while ((line = reader.readLine()) != null) {
-
- String[] values = line.split("\t");
- if (counter == 0) {
-
- } else {
- Document doc = new Document();
- for (int i = 0; i < fields.size() - 1; i++) {
- doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
- }
- String placeName = values[0];
- String lat = values[2];
- String lon = values[1];
- String dsg = "region";
- String id = "rg" + counter;
-
- String hierarchy = placeName;
-
- doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
- doc.add(new TextField("placename", placeName, Field.Store.YES));
- doc.add(new StringField("latitude", lat, Field.Store.YES));
- doc.add(new StringField("longitude", lon, Field.Store.YES));
- doc.add(new StringField("loctype", dsg, Field.Store.YES));
- doc.add(new StringField("admincode", "", Field.Store.YES));
- doc.add(new StringField("countrycode", id, Field.Store.YES));
- doc.add(new StringField("countycode", "", Field.Store.YES));
-
- doc.add(new StringField("locid", id, Field.Store.YES));
- doc.add(new StringField("gazsource", "region", Field.Store.YES));
- //countrycontext file format
- // US KY 131 United States Kentucky Leslie
-
- ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "("
- + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n");
- if (w != null) {
- w.addDocument(doc);
- }
- }
- counter++;
-
- }
- if (w != null) {
- w.commit();
- }
- FileWriter writer = new FileWriter(outputCountryContextfile, true);
- for (String string : ccfileentries) {
- writer.write(string);
- }
- System.out.println("successfully wrote Region entries to country oontext file");
- writer.close();
- System.out.println("Completed indexing regions!");
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+public class RegionProcessor {
+
+ public static void main(String[] args) {
+ RegionProcessor.process(new File("C:\\temp\\gazetteers\\regions.txt"), new File("C:\\temp\\gazetteers\\testRegionContext.txt"), null);
+ }
+
+ /**
+ *
+ * @param regionsFile the file that stores Region references. The format of
+ * this file is tab-delimited text with index 0 as the name of the region,
+ * index 1 as the longitude, and index 2 as the latitude
+ * @param outputCountryContextfile this is the country context file shared by
+ * all indexing processors
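+ * @param w the index writer that receives one document per region; may be null, in which case no documents are indexed or committed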
+ */
+ public static void process(File regionsFile, File outputCountryContextfile, IndexWriter w) {
+ try {
+ readFile(regionsFile, outputCountryContextfile, w);
+ } catch (Exception ex) {
+ ex.printStackTrace();
+ }
+ }
+
+ public static void readFile(File gazateerInputData, File outputCountryContextfile, IndexWriter w) throws Exception {
+ List<String> ccfileentries = new ArrayList<>();
+ BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+ List<String> fields = new ArrayList<>();
+ int counter = 0;
+ System.out.println("reading gazetteer data from Regions file...........");
+ String line = "";
+ while ((line = reader.readLine()) != null) {
+
+ String[] values = line.split("\t");
+ if (counter == 0) {
+
+ } else {
+ Document doc = new Document();
+ for (int i = 0; i < fields.size() - 1; i++) {
+ doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+ }
+ String placeName = values[0];
+ String lat = values[2];
+ String lon = values[1];
+ String dsg = "region";
+ String id = "rg" + counter;
+
+ String hierarchy = placeName;
+
+ doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
+ doc.add(new TextField("placename", placeName, Field.Store.YES));
+ doc.add(new StringField("latitude", lat, Field.Store.YES));
+ doc.add(new StringField("longitude", lon, Field.Store.YES));
+ doc.add(new StringField("loctype", dsg, Field.Store.YES));
+ doc.add(new StringField("admincode", "", Field.Store.YES));
+ doc.add(new StringField("countrycode", id, Field.Store.YES));
+ doc.add(new StringField("countycode", "", Field.Store.YES));
+
+ doc.add(new StringField("locid", id, Field.Store.YES));
+ doc.add(new StringField("gazsource", "region", Field.Store.YES));
+ //countrycontext file format
+ // US KY 131 United States Kentucky Leslie
+
+ ccfileentries.add(id + "\t" + id + "\t" + id + "\t" + placeName + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\t" + "("
+ + placeName + ")" + "\t" + "NO_DATA_FOUND" + "\t" + "NO_DATA_FOUND" + "\n");
+ if (w != null) {
+ w.addDocument(doc);
+ }
+ }
+ counter++;
+
+ }
+ if (w != null) {
+ w.commit();
+ }
+ FileWriter writer = new FileWriter(outputCountryContextfile, true);
+ for (String string : ccfileentries) {
+ writer.write(string);
+ }
+ System.out.println("successfully wrote Region entries to country oontext file");
+ writer.close();
+ System.out.println("Completed indexing regions!");
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
index fcd61c1..61b2120 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java
@@ -1,251 +1,254 @@
-/*
- * Copyright 2014 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.indexing;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.addons.geoentitylinker.AdminBoundary;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
-
-import org.apache.lucene.index.IndexWriter;
-
-public class USGSProcessor {
-
- public static void main(String[] args) {
- try {
- Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS);
- process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null);
- } catch (Exception ex) {
- Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex);
- }
- }
-
- public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception {
- Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS);
- readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData);
- writeCountryContextFile(outputCountryContextfile, provData);
- }
-
- public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {
-
- Map<String, StateCentroid> states = new HashMap<>();
- BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
- List<String> fields = new ArrayList<>();
- int counter = 0;
- System.out.println("reading gazetteer data from USGS file...........");
- String line = "";
- while ((line = reader.readLine()) != null) {
-
- String[] values = line.split(type.getSeparator());
- if (counter == 0) {
- for (String columnName : values) {
- fields.add(columnName.replace("��", "").trim());
- }
-
- } else {
- Document doc = new Document();
- for (int i = 0; i < fields.size() - 1; i++) {
- doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
- }
- String placeName = values[1];
- String lat = values[9];
- String lon = values[10];
- String dsg = values[2];
- String id = values[0];
-
- String ccode = values[6];
- String admincode = values[3];
- AdminBoundary get = lookupMap.get(admincode + "." + ccode);
- String countyname = "";
- if (get == null) {
- System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: " + admincode + " , placename: " + placeName);
- continue;
-
- }
- String countyCode = get.getCountyCode();
-
- if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
- countyname = get.getCountyName();
- }
- if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) {
- countyCode = get.getCountyCode();
- }
- String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;
-
- if (states.containsKey(get.getProvinceName())) {
- StateCentroid entry = states.get(get.getProvinceName());
- entry.count++;
- entry.latSum += Double.valueOf(lat);
- entry.longSum += Double.valueOf(lon);
- } else {
- StateCentroid centroid = new StateCentroid();
- centroid.statecode = get.getProvCode();
- centroid.count = 1;
- centroid.latSum = Double.valueOf(lat);
- centroid.longSum = Double.valueOf(lon);
- states.put(get.getProvinceName(), centroid);
- }
-
- doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
- doc.add(new TextField("placename", placeName, Field.Store.YES));
- doc.add(new TextField("latitude", lat, Field.Store.YES));
- doc.add(new TextField("longitude", lon, Field.Store.YES));
- doc.add(new StringField("loctype", dsg, Field.Store.YES));
- doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES));
- doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));
- doc.add(new StringField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES));
-
- doc.add(new StringField("locid", id, Field.Store.YES));
- doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
- w.addDocument(doc);
- }
- counter++;
- if (counter % 100000 == 0) {
- w.commit();
- System.out.println(counter + " .........USGS entries committed to index..............");
- }
-
- }
-
- for (String state : states.keySet()) {
- StateCentroid get = states.get(state);
- Document doc = new Document();
- doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));
- doc.add(new TextField("placename", state, Field.Store.YES));
- //calculate a centroid for all the points that were in the state
- doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES));
- doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES));
- doc.add(new StringField("loctype", "adm1", Field.Store.YES));
- doc.add(new StringField("admincode", get.statecode, Field.Store.YES));
- doc.add(new StringField("countrycode", "us", Field.Store.YES));
- doc.add(new StringField("countycode", "", Field.Store.YES));
-
- doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
- doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
- w.addDocument(doc);
-
- // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));
- }
- Document doc = new Document();
- doc.add(new TextField("hierarchy", "united states", Field.Store.YES));
- doc.add(new TextField("placename", "united states", Field.Store.YES));
- //calculate a centroid for all the points that were in the state
- doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));
- doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));
- doc.add(new StringField("loctype", "pcli", Field.Store.YES));
- doc.add(new StringField("admincode", "", Field.Store.YES));
- doc.add(new StringField("countrycode", "us", Field.Store.YES));
- doc.add(new StringField("countycode", "", Field.Store.YES));
-
- doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));
- doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
- //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts));
-
- w.addDocument(doc);
- w.commit();
-
- System.out.println("Completed indexing USGS gaz!");
- }
-
- private static class StateCentroid {
-
- double latSum;
- double longSum;
- String statecode;
- int count;
- }
-
- private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
- System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
- Map<String, AdminBoundary> outmap = new HashMap<>();
- BufferedReader reader;
-
- try {
-
- reader = new BufferedReader(new FileReader(govUnitsFile));
- int i = 0;
- String line = "";
- String[] fields = null;
- while ((line = reader.readLine()) != null) {
-
- String[] values = line.split(type.getSeparator());
- if (i == 0) {
- fields = values;
- i++;
- continue;
- }
- i++;
- // System.out.println(i);
- String countyCode = values[2];
- String countyName = values[3];
- String stateCode = values[5];
- String stateName = values[6];
- String countryCode = values[7];
- String countryName = values[8];
- AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName, null, null, null);
- outmap.put(stateCode + "." + countyCode, adminBoundary);
- // System.out.println(adminBoundary);
-
- }
- reader.close();
- } catch (IOException ex) {
- ex.printStackTrace();
- }
- System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());
-
- return outmap;
-
- }
-
- public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
- // FileWriter writer = null;
- try (FileWriter writer = new FileWriter(outfile, true)) {
-
- for (String admkey : adms.keySet()) {
- AdminBoundary adm = adms.get(admkey);
- if (adm == null) {
- continue;
- }
- String province = adm.getProvinceName();
- String country = adm.getCountryName();
- /**
- * this is the standard format of the country context file... Geonames
- * data will have an empty string for the county
- */
- String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\t"
- + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
- writer.write(line);
- /// System.out.println(line);
-
- }
- } catch (IOException ex) {
- Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);
- }
- System.out.println("successfully wrote USGS entries to country context file");
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.indexing;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+
+import opennlp.addons.geoentitylinker.AdminBoundary;
+
+public class USGSProcessor {
+
+ public static void main(String[] args) {
+ try {
+ Map<String, AdminBoundary> provData = getProvData(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), GazetteerIndexer.GazType.USGS);
+ process(new File("C:\\temp\\gazetteers\\usgsdata\\GOVT_UNITS_20140601.txt"), new File("C:\\temp\\gazetteers\\usgsdata\\NationalFile_20140601.txt"), null, null);
+ } catch (Exception ex) {
+ Logger.getLogger(USGSProcessor.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ }
+
+ public static void process(File lookupData, File usgsGazDataFile, File outputCountryContextfile, IndexWriter w) throws Exception {
+ Map<String, AdminBoundary> provData = getProvData(lookupData, GazetteerIndexer.GazType.USGS);
+ readFile(usgsGazDataFile, w, GazetteerIndexer.GazType.USGS, provData);
+ writeCountryContextFile(outputCountryContextfile, provData);
+ }
+
+ public static void readFile(File gazateerInputData, IndexWriter w, GazetteerIndexer.GazType type, Map<String, AdminBoundary> lookupMap) throws Exception {
+
+ Map<String, StateCentroid> states = new HashMap<>();
+ BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
+ List<String> fields = new ArrayList<>();
+ int counter = 0;
+ System.out.println("reading gazetteer data from USGS file...........");
+ String line = "";
+ while ((line = reader.readLine()) != null) {
+
+ String[] values = line.split(type.getSeparator());
+ if (counter == 0) {
+ for (String columnName : values) {
+ fields.add(columnName.replace("��", "").trim());
+ }
+
+ } else {
+ Document doc = new Document();
+ for (int i = 0; i < fields.size() - 1; i++) {
+ doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
+ }
+ String placeName = values[1];
+ String lat = values[9];
+ String lon = values[10];
+ String dsg = values[2];
+ String id = values[0];
+
+ String ccode = values[6];
+ String admincode = values[3];
+ AdminBoundary get = lookupMap.get(admincode + "." + ccode);
+ String countyname = "";
+ if (get == null) {
+ System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: " + admincode + " , placename: " + placeName);
+ continue;
+
+ }
+ String countyCode = get.getCountyCode();
+
+ if (!get.getCountyName().equals("NO_DATA_FOUND_VALUE")) {
+ countyname = get.getCountyName();
+ }
+ if (!get.getCountyCode().equals("NO_DATA_FOUND_VALUE")) {
+ countyCode = get.getCountyCode();
+ }
+ String hierarchy = get.getCountryName() + ", " + get.getProvinceName() + ", " + countyname + ", " + placeName;
+
+ if (states.containsKey(get.getProvinceName())) {
+ StateCentroid entry = states.get(get.getProvinceName());
+ entry.count++;
+ entry.latSum += Double.valueOf(lat);
+ entry.longSum += Double.valueOf(lon);
+ } else {
+ StateCentroid centroid = new StateCentroid();
+ centroid.statecode = get.getProvCode();
+ centroid.count = 1;
+ centroid.latSum = Double.valueOf(lat);
+ centroid.longSum = Double.valueOf(lon);
+ states.put(get.getProvinceName(), centroid);
+ }
+
+ doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
+ doc.add(new TextField("placename", placeName, Field.Store.YES));
+ doc.add(new TextField("latitude", lat, Field.Store.YES));
+ doc.add(new TextField("longitude", lon, Field.Store.YES));
+ doc.add(new StringField("loctype", dsg, Field.Store.YES));
+ doc.add(new StringField("admincode", (get.getCountryCode() + "." + get.getProvCode()).toLowerCase(), Field.Store.YES));
+ doc.add(new StringField("countrycode", get.getCountryCode().toLowerCase(), Field.Store.YES));
+ doc.add(new StringField("countycode", (get.getCountryCode() + "." + get.getProvCode() + "." + countyCode).toLowerCase(), Field.Store.YES));
+
+ doc.add(new StringField("locid", id, Field.Store.YES));
+ doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+ w.addDocument(doc);
+ }
+ counter++;
+ if (counter % 100000 == 0) {
+ w.commit();
+ System.out.println(counter + " .........USGS entries committed to index..............");
+ }
+
+ }
+
+ for (String state : states.keySet()) {
+ StateCentroid get = states.get(state);
+ Document doc = new Document();
+ doc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));
+ doc.add(new TextField("placename", state, Field.Store.YES));
+ //calculate a centroid for all the points that were in the state
+ doc.add(new TextField("latitude", (get.latSum / get.count) + "", Field.Store.YES));
+ doc.add(new TextField("longitude", (get.longSum / get.count) + "", Field.Store.YES));
+ doc.add(new StringField("loctype", "adm1", Field.Store.YES));
+ doc.add(new StringField("admincode", get.statecode, Field.Store.YES));
+ doc.add(new StringField("countrycode", "us", Field.Store.YES));
+ doc.add(new StringField("countycode", "", Field.Store.YES));
+
+ doc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
+ doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+ w.addDocument(doc);
+
+ // System.out.println(get.statecode + "," + (get.latSum / get.count) + "," + (get.longSum / get.count));
+ }
+ Document doc = new Document();
+ doc.add(new TextField("hierarchy", "united states", Field.Store.YES));
+ doc.add(new TextField("placename", "united states", Field.Store.YES));
+ //use a fixed, hard-coded point (39.0, -103.0) as the location of the united states entry
+ doc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));
+ doc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));
+ doc.add(new StringField("loctype", "pcli", Field.Store.YES));
+ doc.add(new StringField("admincode", "", Field.Store.YES));
+ doc.add(new StringField("countrycode", "us", Field.Store.YES));
+ doc.add(new StringField("countycode", "", Field.Store.YES));
+
+ doc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));
+ doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
+ //System.out.println("uscentroid," + (sumofLatSums / sumOfCounts) + "," + (sumofLonSums / sumOfCounts));
+
+ w.addDocument(doc);
+ w.commit();
+
+ System.out.println("Completed indexing USGS gaz!");
+ }
+
+ private static class StateCentroid {
+
+ double latSum;
+ double longSum;
+ String statecode;
+ int count;
+ }
+
+ private static Map<String, AdminBoundary> getProvData(File govUnitsFile, GazetteerIndexer.GazType type) {
+ System.out.println("Attempting to read USGS province (State) data from: " + govUnitsFile.getPath());
+ Map<String, AdminBoundary> outmap = new HashMap<>();
+ BufferedReader reader;
+
+ try {
+
+ reader = new BufferedReader(new FileReader(govUnitsFile));
+ int i = 0;
+ String line = "";
+ String[] fields = null;
+ while ((line = reader.readLine()) != null) {
+
+ String[] values = line.split(type.getSeparator());
+ if (i == 0) {
+ fields = values;
+ i++;
+ continue;
+ }
+ i++;
+ // System.out.println(i);
+ String countyCode = values[2];
+ String countyName = values[3];
+ String stateCode = values[5];
+ String stateName = values[6];
+ String countryCode = values[7];
+ String countryName = values[8];
+ AdminBoundary adminBoundary = new AdminBoundary(countryCode, countryName, stateCode, stateName, countyCode, countyName, null, null, null);
+ outmap.put(stateCode + "." + countyCode, adminBoundary);
+ // System.out.println(adminBoundary);
+
+ }
+ reader.close();
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ }
+ System.out.println("Successfully read USGS province (State) data from: " + govUnitsFile.getPath());
+
+ return outmap;
+
+ }
+
+ public static void writeCountryContextFile(File outfile, Map<String, AdminBoundary> adms) {
+ // FileWriter writer = null;
+ try (FileWriter writer = new FileWriter(outfile, true)) {
+
+ for (String admkey : adms.keySet()) {
+ AdminBoundary adm = adms.get(admkey);
+ if (adm == null) {
+ continue;
+ }
+ String province = adm.getProvinceName();
+ String country = adm.getCountryName();
+ /**
+ * this is the standard format of the country context file... Geonames
+ * data will have an empty string for the county
+ */
+ String line = adm.getCountryCode() + "\t" + adm.getProvCode() + "\t" + adm.getCountyCode() + "\t" + country + "\t" + province + "\t" + adm.getCountyName() + "\t"
+ + "(U\\.S\\.[ $]|U\\.S\\.A\\.[ $]|United States|the US[ $]|a us[ $])" + "\t" + adm.getProvinceName() + "\t" + adm.getCountyName() + "\n";
+ writer.write(line);
+ /// System.out.println(line);
+
+ }
+ } catch (IOException ex) {
+ Logger.getLogger(GeonamesProcessor.class.getName()).log(Level.SEVERE, null, ex);
+ }
+ System.out.println("successfully wrote USGS entries to country context file");
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
index aea8f9b..98c9715 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/CountryProximityScorer.java
@@ -1,281 +1,283 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.regex.Pattern;
-import opennlp.addons.geoentitylinker.AdminBoundaryContext;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Scores toponyms based on their proximity to a country mention. Based on the
- * heuristic that toponymn mentions are more likely close to their parent
- * country mentions. For instance, if the toponym Berlin is mentioned near an
- * indicator of Germany, it is more likely to be Berlin Germany than Berlin
- * Connecticut (if Connecticut is mentioned further down in the article).
- *
- *
- */
-public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {
-
- private Map<String, Set<String>> nameCodesMap;
- String dominantCode = "";
- private Map<String, String> regexMap = new HashMap<>();
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
-
- regexMap = additionalContext.getCountryRegexMap();
- score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
-
- }
-
- /**
- * Assigns a score to each BaseLink in each linkedSpan's set of N best
- * matches. Currently the scoring indicates the probability that the toponym
- * is correct based on the country context in the document
- *
- * @param linkedData the linked spans, holds the Namefinder results, and the
- * list of BaseLink for each
- * @param countryHits all the country mentions in the document
- * @param nameCodesMap maps a country indicator name to a country code. Used
- * to determine if the namefinder found the same exact toponym the country
- * context did. If so the score is boosted due to the high probability that
- * the NameFinder actually "rediscovered" a country
- * @param docText the full text of the document...not used in this default
- * implementation
- * @param sentences the sentences that correspond to the doc text.
- * @param maxAllowedDist a constant that is used to determine which country
- * mentions, based on proximity within the text, should be used to score the
- * Named Entity.
- * @return
- */
- public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
- this.nameCodesMap = nameCodesMap;
- setDominantCode(countryHits);
- for (LinkedSpan<BaseLink> linkedspan : linkedData) {
-
- linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
- }
- return linkedData;
- }
-
- /**
- * sets class level variable to a code based on the number of mentions
- *
- * @param countryHits
- */
- private void setDominantCode(Map<String, Set<Integer>> countryHits) {
- int hits = -1;
- for (String code : countryHits.keySet()) {
- if (countryHits.get(code).size() > hits) {
- hits = countryHits.get(code).size();
- dominantCode = code;
- }
- }
- }
-
- /**
- * Generates distances from each country mention to the span's location in the
- * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
- * are resolved to the correct country and coordinate.
- *
- * @param sentences
- * @param countryHits
- * @param span
- * @return
- */
- private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
- Double score = 0.0;
- /*
- * get the index of the actual span, begining of sentence //should generate
- * tokens from sentence and create a char offset... //could have large
- * sentences due to poor sentence detection or wonky doc text
- */
- int sentenceIdx = span.getSentenceid();
- int sentIndexInDoc = sentences[sentenceIdx].getStart();
- /**
- * create a map of all the span's proximal country mentions in the document
- * Map< countrycode, set of <distances from this NamedEntity>>
- */
- Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
- //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
- for (String cCode : countryHits.keySet()) {
-//iterate over all the regex start values and calculate an offset
- for (Integer cHit : countryHits.get(cCode)) {
- Integer absDist = Math.abs(sentIndexInDoc - cHit);
- //only include near mentions based on a heuristic
- //TODO make this a property
- // if (absDist < maxAllowedDistance) {
- if (distancesFromCodeMap.containsKey(cCode)) {
- distancesFromCodeMap.get(cCode).add(absDist);
- } else {
- HashSet<Integer> newset = new HashSet<Integer>();
- newset.add(absDist);
- distancesFromCodeMap.put(cCode, newset);
- }
- }
-
- //}
- }
- //we now know how far this named entity is from every country mention in the document
-
- /**
- * the gaz matches that have a country code that have mentions in the doc
- * that are closest to the Named Entity should return the best score.
- * Analyzemap generates a likelihood score that the toponym from the gaz is
- * referring to one of the countries, i.e, Map<countrycode, prob that this
- * span is referring to the toponym form this code key>
- */
- Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
- for (BaseLink link : span.getLinkedEntries()) {
- //getItemParentId is the country code
- String spanCountryCode = link.getItemParentID();
- if (scoreMap.containsKey(spanCountryCode)) {
-
- score = scoreMap.get(spanCountryCode);
- ///does the name extracted match a country name?
- if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) {
- //if so, is it the correct country code for that name?
- if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
- //boost the score becuase it is likely that this is the location in the text, so add 50% to the score or set to 1
- score = (score + .75) > 1.0 ? 1d : (score + .75);
-
- if (link.getItemParentID().equals(dominantCode)) {
- score = (score + .25) > 1.0 ? 1d : (score + .25);
- }
- }
- }
- }
-
- link.getScoreMap().put("countrycontext", score);
- }
- return span;
- }
-
- /**
- * takes a map of distances from the toponym to each country mention and
- * generates a map of scores for each country code. The map is then correlated
- * to the code of the BaseLink parentid for retrieval. Then the score is added
- * to the overall list.
- *
- * @param distanceMap
- * @param sentences
- * @param span
- * @return
- */
- private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
-
- Map<String, Double> scoreMap = new HashMap<String, Double>();
- if (distanceMap.isEmpty()) {
- return scoreMap;
- }
- TreeSet<Integer> all = new TreeSet<Integer>();
- for (String key : distanceMap.keySet()) {
- all.addAll(distanceMap.get(key));
- }
- //get min max for normalization, this could be more efficient
-
- Integer min = all.first();
- Integer max = all.last();
- if (min == max) {
- min = 0;
- }
- for (String key : distanceMap.keySet()) {
-
- TreeSet<Double> normalizedDistances = new TreeSet<Double>();
- for (Integer i : distanceMap.get(key)) {
- Double norm = normalize(i, min, max);
- //reverse the normed distance so low numbers (closer) are better
- //this could be improved with a "decaying " function using an imcreaseing negative exponent
- Double reverse = Math.abs(norm - 1);
- normalizedDistances.add(reverse);
- }
-
- List<Double> doubles = new ArrayList<Double>(normalizedDistances);
- scoreMap.put(key, slidingDistanceAverage(doubles));
- }
- return scoreMap;
- }
-
- private boolean regexMatch(String placeName, String countryCode) {
- if (regexMap.containsKey(countryCode)) {
- String regexForCountry = regexMap.get(countryCode);
-
- Pattern p = Pattern.compile(regexForCountry,Pattern.DOTALL|Pattern.CASE_INSENSITIVE);
- return p.matcher(placeName.trim()).matches();
- }
- return false;
- }
-
- /**
- * this method is an attempt to make closer clusters of mentions group
- * together to smooth out the average, so one distant outlier does not kill
- * the score for an obviously good hit. More elegant solution is possible
- * using Math.pow, and making the score decay with distance by using an
- * increasing negative exponent (I think)
- *
- * @param normDis the normalized and sorted set of distances as a list
- * @return
- */
- private Double slidingDistanceAverage(List<Double> normDis) {
- List<Double> windowOfAverages = new ArrayList<Double>();
-
- if (normDis.size() < 3) {
- windowOfAverages.addAll(normDis);
- } else {
-
- for (int i = 0; i < normDis.size() - 1; i++) {
- double a = normDis.get(i);
- double b = normDis.get(i + 1);
- windowOfAverages.add((a + b) / 2);
-
- }
- }
- double sum = 0d;
- for (double d : windowOfAverages) {
- sum += d;
- }
- double result = sum / windowOfAverages.size();
- //TODO: ++ prob when large amounts of mentions for a code
- //System.out.println("avg of window:" + result);
- return result;
- }
-
- /**
- * transposes a value within one range to a relative value in a different
- * range. Used to normalize distances in this class.
- *
- * @param valueToNormalize the value to place within the new range
- * @param minimum the min of the set to be transposed
- * @param maximum the max of the set to be transposed
- * @return
- */
- private Double normalize(int valueToNormalize, int minimum, int maximum) {
- Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
- d = d == null ? 0d : d;
- return d;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.regex.Pattern;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on their proximity to a country mention. Based on the
+ * heuristic that toponym mentions are more likely close to their parent
+ * country mentions. For instance, if the toponym Berlin is mentioned near an
+ * indicator of Germany, it is more likely to be Berlin Germany than Berlin
+ * Connecticut (if Connecticut is mentioned further down in the article).
+ *
+ * The score is stored in each link's score map under the key
+ * "countrycontext".
+ */
+public class CountryProximityScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+ // country indicator name -> country codes; looked up with the lower-cased item name, replaced on every score(...) call
+ private Map<String, Set<String>> nameCodesMap;
+ // country code with the largest number of mentions, see setDominantCode
+ String dominantCode = "";
+ // country code -> regular expression used by regexMatch; taken from the AdminBoundaryContext
+ private Map<String, String> regexMap = new HashMap<>();
+
+ /**
+ * Scores every link of every linked span using the country mentions, the
+ * name/code map and the country regex map carried by
+ * {@code additionalContext}. {@code properties} is not used by this scorer.
+ * The maximum distance is passed as 1000, but the filter that would use it
+ * is currently commented out in simpleProximityAnalysis.
+ */
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+
+ regexMap = additionalContext.getCountryRegexMap();
+ score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
+
+ }
+
+ /**
+ * Assigns a score to each BaseLink in each linkedSpan's set of N best
+ * matches. Currently the scoring indicates the probability that the toponym
+ * is correct based on the country context in the document. The scores are
+ * written into the score map of each link; {@code nameCodesMap} and the
+ * dominant country code are stored in instance fields as a side effect.
+ *
+ * @param linkedData the linked spans, holds the Namefinder results, and the
+ * list of BaseLink for each
+ * @param countryHits all the country mentions in the document, keyed by
+ * country code; the values are used as offsets in the document text
+ * @param nameCodesMap maps a country indicator name to a country code. Used
+ * to determine if the namefinder found the same exact toponym the country
+ * context did. If so the score is boosted due to the high probability that
+ * the NameFinder actually "rediscovered" a country
+ * @param docText the full text of the document...not used in this default
+ * implementation
+ * @param sentences the sentences that correspond to the doc text.
+ * @param maxAllowedDist a constant that is meant to determine which country
+ * mentions, based on proximity within the text, should be used to score the
+ * Named Entity. It is passed on but currently has no effect, because the
+ * distance filter is commented out.
+ * @return the same {@code linkedData} list that was passed in
+ */
+ public List<LinkedSpan> score(List<LinkedSpan> linkedData, Map<String, Set<Integer>> countryHits, Map<String, Set<String>> nameCodesMap, String docText, Span[] sentences, Integer maxAllowedDist) {
+ this.nameCodesMap = nameCodesMap;
+ setDominantCode(countryHits);
+ for (LinkedSpan<BaseLink> linkedspan : linkedData) {
+
+ // the assignment only rebinds the loop variable; the useful effect is the score written into each link
+ linkedspan = simpleProximityAnalysis(sentences, countryHits, linkedspan, maxAllowedDist);
+ }
+ return linkedData;
+ }
+
+ /**
+ * Sets the class level variable {@code dominantCode} to the country code
+ * with the largest number of mentions. On a tie the code seen first during
+ * iteration wins. If {@code countryHits} is empty, {@code dominantCode}
+ * keeps its previous value.
+ *
+ * @param countryHits mention offsets per country code
+ */
+ private void setDominantCode(Map<String, Set<Integer>> countryHits) {
+ int hits = -1;
+ for (String code : countryHits.keySet()) {
+ if (countryHits.get(code).size() > hits) {
+ hits = countryHits.get(code).size();
+ dominantCode = code;
+ }
+ }
+ }
+
+ /**
+ * Generates distances from each country mention to the span's location in the
+ * doc text. Ultimately an attempt to ensure that ambiguously named toponyms
+ * are resolved to the correct country and coordinate. The resulting score of
+ * each link is put into its score map under "countrycontext".
+ *
+ * @param sentences the sentence spans of the document; the start of the
+ * span's sentence is used as the span's position
+ * @param countryHits mention offsets per country code
+ * @param span the linked span whose links are scored
+ * @param maxAllowedDistance currently unused, the filter below is commented out
+ * @return the {@code span} that was passed in
+ */
+ private LinkedSpan<BaseLink> simpleProximityAnalysis(Span[] sentences, Map<String, Set<Integer>> countryHits, LinkedSpan<BaseLink> span, Integer maxAllowedDistance) {
+ // NOTE(review): score is declared outside the link loop and never reset, so a link whose
+ // country code is missing from scoreMap receives the score of the previous link - confirm this is intended
+ Double score = 0.0;
+ /*
+ * get the index of the actual span, beginning of sentence //should generate
+ * tokens from sentence and create a char offset... //could have large
+ * sentences due to poor sentence detection or wonky doc text
+ */
+ int sentenceIdx = span.getSentenceid();
+ int sentIndexInDoc = sentences[sentenceIdx].getStart();
+ /**
+ * create a map of all the span's proximal country mentions in the document
+ * Map< countrycode, set of <distances from this NamedEntity>>
+ */
+ Map<String, Set<Integer>> distancesFromCodeMap = new HashMap<String, Set<Integer>>();
+ //map = Map<countrycode, Set <of distances this span is from all the mentions of the code>>
+ for (String cCode : countryHits.keySet()) {
+ //iterate over all the regex start values and calculate an offset
+ for (Integer cHit : countryHits.get(cCode)) {
+ Integer absDist = Math.abs(sentIndexInDoc - cHit);
+ //only include near mentions based on a heuristic
+ //TODO make this a property
+ // if (absDist < maxAllowedDistance) {
+ if (distancesFromCodeMap.containsKey(cCode)) {
+ distancesFromCodeMap.get(cCode).add(absDist);
+ } else {
+ HashSet<Integer> newset = new HashSet<Integer>();
+ newset.add(absDist);
+ distancesFromCodeMap.put(cCode, newset);
+ }
+ }
+
+ //}
+ }
+ //we now know how far this named entity is from every country mention in the document
+
+ /**
+ * the gaz matches that have a country code that have mentions in the doc
+ * that are closest to the Named Entity should return the best score.
+ * Analyzemap generates a likelihood score that the toponym from the gaz is
+ * referring to one of the countries, i.e, Map<countrycode, prob that this
+ * span is referring to the toponym from this code key>
+ */
+ Map<String, Double> scoreMap = analyzeMap(distancesFromCodeMap, sentences, span);
+ for (BaseLink link : span.getLinkedEntries()) {
+ //getItemParentId is the country code
+ String spanCountryCode = link.getItemParentID();
+ if (scoreMap.containsKey(spanCountryCode)) {
+
+ score = scoreMap.get(spanCountryCode);
+ ///does the name extracted match a country name?
+ if (nameCodesMap.containsKey(link.getItemName().toLowerCase()) || regexMatch(link.getItemName(), link.getItemParentID())) {
+ //if so, is it the correct country code for that name?
+ // NOTE(review): when only regexMatch(...) was true the name is not a key of nameCodesMap,
+ // get(...) returns null and the call below throws a NullPointerException - confirm
+ if (nameCodesMap.get(link.getItemName().toLowerCase()).contains(link.getItemParentID())) {
+ //boost the score because it is likely that this is the location in the text: add 0.75, capped at 1
+ score = (score + .75) > 1.0 ? 1d : (score + .75);
+
+ // a further 0.25 (again capped at 1) when the country is also the most mentioned one
+ if (link.getItemParentID().equals(dominantCode)) {
+ score = (score + .25) > 1.0 ? 1d : (score + .25);
+ }
+ }
+ }
+ }
+
+ link.getScoreMap().put("countrycontext", score);
+ }
+ return span;
+ }
+
+ /**
+ * takes a map of distances from the toponym to each country mention and
+ * generates a map of scores for each country code. The map is then correlated
+ * to the code of the BaseLink parentid for retrieval. Then the score is added
+ * to the overall list.
+ *
+ * Distances are min-max normalized over all country codes, reversed so that
+ * closer mentions give higher values, and then averaged per country code by
+ * slidingDistanceAverage.
+ *
+ * @param distanceMap distances from the toponym per country code
+ * @param sentences not used by this method
+ * @param span not used by this method
+ * @return a score per country code; empty when {@code distanceMap} is empty
+ */
+ private Map<String, Double> analyzeMap(Map<String, Set<Integer>> distanceMap, Span[] sentences, LinkedSpan<BaseLink> span) {
+
+ Map<String, Double> scoreMap = new HashMap<String, Double>();
+ if (distanceMap.isEmpty()) {
+ return scoreMap;
+ }
+ TreeSet<Integer> all = new TreeSet<Integer>();
+ for (String key : distanceMap.keySet()) {
+ all.addAll(distanceMap.get(key));
+ }
+ //get min max for normalization, this could be more efficient
+
+ Integer min = all.first();
+ Integer max = all.last();
+ // only one distinct distance: widen the range to [0, max] so it is not empty
+ // NOTE(review): == compares Integer references here, not values; equals() would state the intent - confirm
+ // NOTE(review): if that single distance is 0 the range stays 0 and normalize divides 0 by 0 (NaN) - confirm
+ if (min == max) {
+ min = 0;
+ }
+ for (String key : distanceMap.keySet()) {
+
+ // a TreeSet keeps the values sorted ascending and drops duplicates
+ TreeSet<Double> normalizedDistances = new TreeSet<Double>();
+ for (Integer i : distanceMap.get(key)) {
+ Double norm = normalize(i, min, max);
+ //reverse the normed distance so low numbers (closer) are better
+ //this could be improved with a "decaying" function using an increasing negative exponent
+ Double reverse = Math.abs(norm - 1);
+ normalizedDistances.add(reverse);
+ }
+
+ List<Double> doubles = new ArrayList<Double>(normalizedDistances);
+ scoreMap.put(key, slidingDistanceAverage(doubles));
+ }
+ return scoreMap;
+ }
+
+ /**
+ * Checks whether the trimmed place name matches, as a whole, the regular
+ * expression registered for the country code. Matching is case insensitive
+ * and uses DOTALL. The pattern is compiled on every call.
+ *
+ * @param placeName the name to test
+ * @param countryCode the country code whose regular expression is used
+ * @return true on a full match; false when there is no match or no regular
+ * expression is registered for the code
+ */
+ private boolean regexMatch(String placeName, String countryCode) {
+ if (regexMap.containsKey(countryCode)) {
+ String regexForCountry = regexMap.get(countryCode);
+
+ Pattern p = Pattern.compile(regexForCountry,Pattern.DOTALL|Pattern.CASE_INSENSITIVE);
+ return p.matcher(placeName.trim()).matches();
+ }
+ return false;
+ }
+
+ /**
+ * this method is an attempt to make closer clusters of mentions group
+ * together to smooth out the average, so one distant outlier does not kill
+ * the score for an obviously good hit. More elegant solution is possible
+ * using Math.pow, and making the score decay with distance by using an
+ * increasing negative exponent (I think)
+ *
+ * With fewer than three values the plain mean is returned; otherwise the
+ * mean of the averages of each pair of neighbouring values.
+ *
+ * @param normDis the normalized and sorted set of distances as a list
+ * @return the averaged value; NaN if {@code normDis} is empty
+ */
+ private Double slidingDistanceAverage(List<Double> normDis) {
+ List<Double> windowOfAverages = new ArrayList<Double>();
+
+ if (normDis.size() < 3) {
+ windowOfAverages.addAll(normDis);
+ } else {
+
+ // window of two: average every value with its successor
+ for (int i = 0; i < normDis.size() - 1; i++) {
+ double a = normDis.get(i);
+ double b = normDis.get(i + 1);
+ windowOfAverages.add((a + b) / 2);
+
+ }
+ }
+ double sum = 0d;
+ for (double d : windowOfAverages) {
+ sum += d;
+ }
+ double result = sum / windowOfAverages.size();
+ //TODO: ++ prob when large amounts of mentions for a code
+ //System.out.println("avg of window:" + result);
+ return result;
+ }
+
+ /**
+ * transposes a value within one range to a relative value in a different
+ * range. Used to normalize distances in this class. The target range is
+ * 0 to 1.
+ *
+ * @param valueToNormalize the value to place within the new range
+ * @param minimum the min of the set to be transposed
+ * @param maximum the max of the set to be transposed
+ * @return (valueToNormalize - minimum) / (maximum - minimum)
+ */
+ private Double normalize(int valueToNormalize, int minimum, int maximum) {
+ // the cast applies to the numerator, so this is a floating point division
+ // NOTE(review): maximum == minimum therefore yields NaN or Infinity instead of an exception - confirm callers cope
+ Double d = (double) ((1 - 0) * (valueToNormalize - minimum)) / (maximum - minimum) + 0;
+ // NOTE(review): d is the boxed result of an arithmetic expression and cannot be null, so this check never fires
+ d = d == null ? 0d : d;
+ return d;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
index e9634d9..abe5438 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/FuzzyStringMatchScorer.java
@@ -1,123 +1,125 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import opennlp.addons.geoentitylinker.AdminBoundaryContext;
-import opennlp.addons.geoentitylinker.GazetteerEntry;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- *
- * Generates scores based on string comparisons levenstein and dice
- */
-public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> {
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
- for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
- for (BaseLink link : linkedSpan.getLinkedEntries()) {
- if (link instanceof GazetteerEntry) {
- GazetteerEntry entry = (GazetteerEntry) link;
- String hierarchy = entry.getHierarchy();
- if (hierarchy != null) {
- Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2);
- link.getScoreMap().put("hierarchydicecoef", dice);
- Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase());
- link.getScoreMap().put("hierarchylevenshtein", ld);
- }
- String placename = entry.getItemName().toLowerCase();
- if (placename != null) {
- Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename, 2);
- link.getScoreMap().put("placenamedicecoef", dice);
-
- }
- }
- }
- }
-
- }
-
- /**
- * Generates a score based on an overlap of nGrams between two strings using
- * the DiceCoefficient technique.
- *
- * @param s1 first string
- * @param s2 second string
- * @param nGrams number of chars in each gram
- * @return
- */
- public double getDiceCoefficient(String s1, String s2, int nGrams) {
- if (s1.isEmpty() || s2.isEmpty()) {
- return 0d;
- }
- List<String> s1Grams = new ArrayList<>();
- List<String> s2Grams = new ArrayList<>();
- String[] split1 = s1.split("[ ,]");
- for (String token : split1) {
- if (token.trim().equals("")) {
- continue;
- }
- s1Grams.add(token);
- }
- String[] split2 = s2.split("[ ,]");
- for (String token : split2) {
- if (token.trim().equals("")) {
- continue;
- }
- s2Grams.add(token);
- }
-
- Set<String> overlap = new HashSet<String>(s1Grams);
- overlap.retainAll(s2Grams);
- double totcombigrams = overlap.size();
-
- return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
- }
-
- private int minimum(int a, int b, int c) {
- return Math.min(Math.min(a, b), c);
- }
-
- public int getLevenshteinDistance(CharSequence str1,
- CharSequence str2) {
- int[][] distance = new int[str1.length() + 1][str2.length() + 1];
-
- for (int i = 0; i <= str1.length(); i++) {
- distance[i][0] = i;
- }
- for (int j = 1; j <= str2.length(); j++) {
- distance[0][j] = j;
- }
-
- for (int i = 1; i <= str1.length(); i++) {
- for (int j = 1; j <= str2.length(); j++) {
- distance[i][j] = minimum(
- distance[i - 1][j] + 1,
- distance[i][j - 1] + 1,
- distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
- }
- }
-
- return distance[str1.length()][str2.length()];
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Generates scores based on string comparisons: Levenshtein distance and a
+ * token based Dice coefficient. Only links that are {@link GazetteerEntry}
+ * instances are scored.
+ */
+public class FuzzyStringMatchScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+ /**
+ * Compares the lower-cased search term of each linked span with the
+ * hierarchy and the item name of each of its gazetteer entries and stores
+ * the results in the link's score map under "hierarchydicecoef",
+ * "hierarchylevenshtein" (the raw edit distance as a double) and
+ * "placenamedicecoef". {@code docText}, {@code sentenceSpans},
+ * {@code properties} and {@code additionalContext} are not used.
+ */
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+ for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
+ for (BaseLink link : linkedSpan.getLinkedEntries()) {
+ if (link instanceof GazetteerEntry) {
+ GazetteerEntry entry = (GazetteerEntry) link;
+ String hierarchy = entry.getHierarchy();
+ if (hierarchy != null) {
+ Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase(), 2);
+ link.getScoreMap().put("hierarchydicecoef", dice);
+ Double ld = (double) getLevenshteinDistance(linkedSpan.getSearchTerm().toLowerCase(), hierarchy.toLowerCase());
+ link.getScoreMap().put("hierarchylevenshtein", ld);
+ }
+ // NOTE(review): toLowerCase() is called before the null check below, so a null item name
+ // fails here and the check can never be false - confirm
+ String placename = entry.getItemName().toLowerCase();
+ if (placename != null) {
+ Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase(), placename, 2);
+ link.getScoreMap().put("placenamedicecoef", dice);
+
+ }
+ }
+ }
+ }
+
+ }
+
+ /**
+ * Generates a score based on the overlap between two strings using the
+ * DiceCoefficient technique. Both strings are split on spaces and commas
+ * and the resulting tokens are compared as whole words: twice the number of
+ * distinct shared tokens divided by the total number of tokens of both
+ * strings.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @param nGrams not used by the current implementation
+ * @return the coefficient; 0 when either string is empty
+ */
+ public double getDiceCoefficient(String s1, String s2, int nGrams) {
+ if (s1.isEmpty() || s2.isEmpty()) {
+ return 0d;
+ }
+ List<String> s1Grams = new ArrayList<>();
+ List<String> s2Grams = new ArrayList<>();
+ String[] split1 = s1.split("[ ,]");
+ for (String token : split1) {
+ // skip the empty tokens produced by consecutive separators
+ if (token.trim().equals("")) {
+ continue;
+ }
+ s1Grams.add(token);
+ }
+ String[] split2 = s2.split("[ ,]");
+ for (String token : split2) {
+ if (token.trim().equals("")) {
+ continue;
+ }
+ s2Grams.add(token);
+ }
+
+ // the overlap is a set (distinct tokens), while the sizes below count repeated tokens
+ Set<String> overlap = new HashSet<String>(s1Grams);
+ overlap.retainAll(s2Grams);
+ double totcombigrams = overlap.size();
+
+ // NOTE(review): if both strings consist only of separators both lists are empty and this is 0.0 / 0 (NaN) - confirm
+ return (2 * totcombigrams) / (s1Grams.size() + s2Grams.size());
+ }
+
+ /** Returns the smallest of the three values. */
+ private int minimum(int a, int b, int c) {
+ return Math.min(Math.min(a, b), c);
+ }
+
+ /**
+ * Computes the Levenshtein (edit) distance between two character sequences
+ * with a full (length + 1) x (length + 1) table. Insertions, deletions and
+ * substitutions each cost 1.
+ *
+ * @param str1 first sequence
+ * @param str2 second sequence
+ * @return the edit distance between {@code str1} and {@code str2}
+ */
+ public int getLevenshteinDistance(CharSequence str1,
+ CharSequence str2) {
+ int[][] distance = new int[str1.length() + 1][str2.length() + 1];
+
+ // first column and first row: distance from or to the empty prefix
+ for (int i = 0; i <= str1.length(); i++) {
+ distance[i][0] = i;
+ }
+ for (int j = 1; j <= str2.length(); j++) {
+ distance[0][j] = j;
+ }
+
+ for (int i = 1; i <= str1.length(); i++) {
+ for (int j = 1; j <= str2.length(); j++) {
+ // the three candidates use the cells above, to the left and diagonal (diagonal costs 0 when the characters are equal)
+ distance[i][j] = minimum(
+ distance[i - 1][j] + 1,
+ distance[i][j - 1] + 1,
+ distance[i - 1][j - 1] + ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
+ }
+ }
+
+ return distance[str1.length()][str2.length()];
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
index d3494e0..98bad74 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/GeoHashBinningScorer.java
@@ -1,62 +1,64 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import opennlp.addons.geoentitylinker.AdminBoundaryContext;
-import opennlp.addons.geoentitylinker.GazetteerEntry;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Scores toponymns based on geographic point binning. Based on the heuristic
- * that docs are generally about a small amount of locations, so one can detect
- * outliers by finding those points that are not near the majority
- *
- */
-public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> {
-
- private final PointClustering CLUSTERER = new PointClustering();
- private int PRECISION = 3;
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
- //Map<Double, Double> latLongs = new HashMap<Double, Double>();
- List<GazetteerEntry> allGazEntries = new ArrayList<>();
-
- /**
- * collect all the gaz entry references
- */
- for (LinkedSpan<BaseLink> ls : linkedSpans) {
- for (BaseLink bl : ls.getLinkedEntries()) {
- if (bl instanceof GazetteerEntry) {
- allGazEntries.add((GazetteerEntry) bl);
- }
- }
- }
- /**
- * use the point clustering to score each hit
- */
- Map<String, List<GazetteerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION);
- CLUSTERER.scoreClusters(cluster);
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.addons.geoentitylinker.GazetteerEntry;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores toponyms based on geographic point binning. Based on the heuristic
+ * that docs are generally about a small number of locations, so one can detect
+ * outliers by finding those points that are not near the majority.
+ *
+ */
+public class GeoHashBinningScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+ private final PointClustering CLUSTERER = new PointClustering();
+ private int PRECISION = 3;
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+ //Map<Double, Double> latLongs = new HashMap<Double, Double>();
+ List<GazetteerEntry> allGazEntries = new ArrayList<>();
+
+ /**
+ * collect all the gaz entry references
+ */
+ for (LinkedSpan<BaseLink> ls : linkedSpans) {
+ for (BaseLink bl : ls.getLinkedEntries()) {
+ if (bl instanceof GazetteerEntry) {
+ allGazEntries.add((GazetteerEntry) bl);
+ }
+ }
+ }
+ /**
+ * use the point clustering to score each hit
+ */
+ Map<String, List<GazetteerEntry>> cluster = CLUSTERER.cluster(allGazEntries, PRECISION);
+ CLUSTERER.scoreClusters(cluster);
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
index 5fb9c5d..843d9b8 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/LinkedEntityScorer.java
@@ -1,40 +1,42 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.util.List;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-
-/**
- * Structure for scoring linked entities. The Map logically represents a pair :
- * "Score type" to the "actual Score."
- * @param <T> a generic for providing additional context
- */
-public interface LinkedEntityScorer<T> {
-
-/**
- * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan
- * this method internally affects the reference to linkedSpans that was passed in
- * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
- * @param docText the full text of the document.
- * @param sentenceSpans the sentence spans the correspond to the document text
- * @param properties the entitylinker properties config file
- * @param additionalContext any additional data required to perform the scoring operation
- */
- void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.util.List;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Structure for scoring linked entities. The Map logically represents a pair :
+ * "Score type" to the "actual Score."
+ * @param <T> a generic for providing additional context
+ */
+public interface LinkedEntityScorer<T> {
+
+/**
+ * Scores a collection of linked entities. Implementations should populate the scoreMap in the list of BaseLink for each linkedSpan
+ * this method internally affects the reference to linkedSpans that was passed in
+ * @param linkedSpans the spans that have been linked to some external source and have all the data they need to be scored
+ * @param docText the full text of the document.
+ * @param sentenceSpans the sentence spans that correspond to the document text
+ * @param properties the entitylinker properties config file
+ * @param additionalContext any additional data required to perform the scoring operation
+ */
+ void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
+}
http://git-wip-us.apache.org/repos/asf/opennlp-addons/blob/9adc2525/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
----------------------------------------------------------------------
diff --git a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
index 01b3269..034c526 100644
--- a/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
+++ b/geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/scoring/ModelBasedScorer.java
@@ -1,160 +1,163 @@
-/*
- * Copyright 2013 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package opennlp.addons.geoentitylinker.scoring;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import opennlp.addons.geoentitylinker.AdminBoundaryContext;
-import opennlp.tools.doccat.DoccatModel;
-import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.entitylinker.EntityLinkerProperties;
-import opennlp.tools.entitylinker.BaseLink;
-import opennlp.tools.entitylinker.LinkedSpan;
-import opennlp.tools.util.Span;
-import org.apache.log4j.Logger;
-
-/**
- *
- * Utilizes a doccat model to score toponyms based on surrounding context
- */
-public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> {
-
- private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);
- DocumentCategorizerME documentCategorizerME;
- DoccatModel doccatModel;
- public static final int RADIUS = 200;
- boolean modelexists = false;
-
- @Override
- public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
- try {
- if (doccatModel == null) {
- String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
- if (path.equals("")) {
- return;
- }
- modelexists = true;
- doccatModel = new DoccatModel(new File(path));
- documentCategorizerME = new DocumentCategorizerME(doccatModel);
- }
- Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
- for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
- Map<String, Double> scores = this.getScore(entry.getValue());
- for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
- double score = 0d;
- if (scores.containsKey(link.getItemParentID())) {
- score = scores.get(link.getItemParentID());
- }
- link.getScoreMap().put("countrymodel", score);
- }
- }
-
- } catch (FileNotFoundException ex) {
- LOGGER.error(ex);
- } catch (IOException ex) {
- LOGGER.error(ex);
- } catch (Exception ex) {
- LOGGER.error(ex);
- }
- }
-
- /**
- * generates features using a BagOfWordsfeatureGenerator that are within the
- * radius of a mention within the doctext
- *
- * @param linkedSpans
- * @param sentenceSpans
- * @param docText
- * @param radius
- * @return a map of the index of the linked span to the string of surrounding
- * text: Map<indexofspan,surrounding text>
- */
- public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
- Map<Integer, String> featureBags = new HashMap<>();
- Map<Integer, Integer> nameMentionMap = new HashMap<>();
- /**
- * iterator over the map that contains a mapping of every country code to
- * all of its mentions in the document
- */
- for (int i = 0; i < linkedSpans.size(); i++) {
- LinkedSpan span = linkedSpans.get(i);
- if (span.getLinkedEntries().isEmpty()) {
- //don't care about spans that did not get linked to anything at all; nothing to work with
- continue;
- }
- /**
- * get the sentence the name span was found in, the beginning of the
- * sentence will suffice as a centroid for feature generation around the
- * named entity
- */
- Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
- nameMentionMap.put(i, mentionIdx);
- }
- /**
- * now associate each span to a string that will be used for categorization
- * against the model.
- */
- for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
- featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
- }
-
- return featureBags;
- }
-
- public String getTextChunk(int mentionIdx, String docText, int radius) {
- int docSize = docText.length();
- int left = 0, right = 0;
- left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
- right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
- String chunk = "";
- if (right <= left) {
- chunk = "";
- } else {
- /**
- * don't want to chop any words in half, so take fron the first space to
- * the last space in the chunk string
- */
- chunk = docText.substring(left, right);
- if (left != 0) {
- left = chunk.indexOf(" ");
- }
- right = chunk.lastIndexOf(" ");
- /**
- * now get the substring again with only whole words
- */
- if (left < right) {
- chunk = chunk.substring(left, right);
- }
- }
-
- return chunk;
- }
-
- private Map<String, Double> getScore(String text) throws Exception {
- Map<String, Double> scoreMap = new HashMap<>();
- double[] categorize = documentCategorizerME.categorize(text);
- int catSize = documentCategorizerME.getNumberOfCategories();
- for (int i = 0; i < catSize; i++) {
- String category = documentCategorizerME.getCategory(i);
- scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
- }
- return scoreMap;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.addons.geoentitylinker.scoring;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
+
+import opennlp.addons.geoentitylinker.AdminBoundaryContext;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.entitylinker.BaseLink;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.LinkedSpan;
+import opennlp.tools.util.Span;
+
+/**
+ * Utilizes a doccat model to score toponyms based on surrounding context
+ */
+public class ModelBasedScorer implements LinkedEntityScorer<AdminBoundaryContext> {
+
+ private static final Logger LOGGER = Logger.getLogger(ModelBasedScorer.class);
+ DocumentCategorizerME documentCategorizerME;
+ DoccatModel doccatModel;
+ public static final int RADIUS = 200;
+ boolean modelexists = false;
+
+ @Override
+ public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, AdminBoundaryContext additionalContext) {
+ try {
+ if (doccatModel == null) {
+ String path = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
+ if (path.equals("")) {
+ return;
+ }
+ modelexists = true;
+ doccatModel = new DoccatModel(new File(path));
+ documentCategorizerME = new DocumentCategorizerME(doccatModel);
+ }
+ Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
+ for (Map.Entry<Integer, String> entry : proximalFeatures.entrySet()) {
+ Map<String, Double> scores = this.getScore(entry.getValue());
+ for (BaseLink link : (List<BaseLink>) linkedSpans.get(entry.getKey()).getLinkedEntries()) {
+ double score = 0d;
+ if (scores.containsKey(link.getItemParentID())) {
+ score = scores.get(link.getItemParentID());
+ }
+ link.getScoreMap().put("countrymodel", score);
+ }
+ }
+
+ } catch (FileNotFoundException ex) {
+ LOGGER.error(ex);
+ } catch (IOException ex) {
+ LOGGER.error(ex);
+ } catch (Exception ex) {
+ LOGGER.error(ex);
+ }
+ }
+
+ /**
+ * generates features using a BagOfWordsfeatureGenerator that are within the
+ * radius of a mention within the doctext
+ *
+ * @param linkedSpans
+ * @param sentenceSpans
+ * @param docText
+ * @param radius
+ * @return a map of the index of the linked span to the string of surrounding
+ * text: {@code Map<indexofspan,surrounding text>}
+ */
+ public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
+ Map<Integer, String> featureBags = new HashMap<>();
+ Map<Integer, Integer> nameMentionMap = new HashMap<>();
+ /**
+ * iterator over the map that contains a mapping of every country code to
+ * all of its mentions in the document
+ */
+ for (int i = 0; i < linkedSpans.size(); i++) {
+ LinkedSpan span = linkedSpans.get(i);
+ if (span.getLinkedEntries().isEmpty()) {
+ //don't care about spans that did not get linked to anything at all; nothing to work with
+ continue;
+ }
+ /**
+ * get the sentence the name span was found in, the beginning of the
+ * sentence will suffice as a centroid for feature generation around the
+ * named entity
+ */
+ Integer mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
+ nameMentionMap.put(i, mentionIdx);
+ }
+ /**
+ * now associate each span to a string that will be used for categorization
+ * against the model.
+ */
+ for (Map.Entry<Integer, Integer> entry : nameMentionMap.entrySet()) {
+ featureBags.put(entry.getKey(), getTextChunk(entry.getValue(), docText, radius));
+ }
+
+ return featureBags;
+ }
+
+ public String getTextChunk(int mentionIdx, String docText, int radius) {
+ int docSize = docText.length();
+ int left = 0, right = 0;
+ left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
+ right = (mentionIdx + radius > docSize) ? docSize : mentionIdx + radius;
+ String chunk = "";
+ if (right <= left) {
+ chunk = "";
+ } else {
+ /**
+ * don't want to chop any words in half, so take from the first space to
+ * the last space in the chunk string
+ */
+ chunk = docText.substring(left, right);
+ if (left != 0) {
+ left = chunk.indexOf(" ");
+ }
+ right = chunk.lastIndexOf(" ");
+ /**
+ * now get the substring again with only whole words
+ */
+ if (left < right) {
+ chunk = chunk.substring(left, right);
+ }
+ }
+
+ return chunk;
+ }
+
+ private Map<String, Double> getScore(String text) throws Exception {
+ Map<String, Double> scoreMap = new HashMap<>();
+ double[] categorize = documentCategorizerME.categorize(text);
+ int catSize = documentCategorizerME.getNumberOfCategories();
+ for (int i = 0; i < catSize; i++) {
+ String category = documentCategorizerME.getCategory(i);
+ scoreMap.put(category, categorize[documentCategorizerME.getIndex(category)]);
+ }
+ return scoreMap;
+ }
+}