You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2013/11/12 12:54:20 UTC

svn commit: r1541014 - in /opennlp/sandbox/apache-opennlp-addons/src/main: java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ resources/

Author: markg
Date: Tue Nov 12 11:54:20 2013
New Revision: 1541014

URL: http://svn.apache.org/r1541014
Log:
OPENNLP-615
Added a scoring impl that utilizes a doccat model to help with toponym resolution. The ModelBasedScorer also contains two static methods for training the model based on the CountryContext information used by the GeoEntityLinker.

Added:
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
    opennlp/sandbox/apache-opennlp-addons/src/main/resources/
Modified:
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java?rev=1541014&r1=1541013&r2=1541014&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryContext.java Tue Nov 12 11:54:20 2013
@@ -82,7 +82,7 @@ public class CountryContext {
      //   countrydata = getCountryData(properties);
       }
       for (CountryContextEntry entry : countrydata) {
-        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
+        Pattern regex = Pattern.compile(entry.getFull_name_nd_ro().trim(), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
         Matcher rs = regex.matcher(docText);
         String code = entry.getCc1().toLowerCase();
 

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java?rev=1541014&r1=1541013&r2=1541014&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/CountryProximityScorer.java Tue Nov 12 11:54:20 2013
@@ -22,6 +22,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.domain.BaseLink;
 import opennlp.tools.entitylinker.domain.LinkedSpan;
 import opennlp.tools.util.Span;
@@ -35,7 +36,7 @@ public class CountryProximityScorer impl
   String dominantCode = "";
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
 
     score(linkedSpans, additionalContext.getCountryMentions(), additionalContext.getNameCodesMap(), docText, sentenceSpans, 1000);
 
@@ -191,8 +192,12 @@ public class CountryProximityScorer impl
       all.addAll(distanceMap.get(key));
     }
     //get min max for normalization, this could be more efficient
+
     Integer min = all.first();
     Integer max = all.last();
+    if(min==max){
+      min=0;
+    }
     for (String key : distanceMap.keySet()) {
 
       TreeSet<Double> normalizedDistances = new TreeSet<Double>();

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java?rev=1541014&r1=1541013&r2=1541014&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java Tue Nov 12 11:54:20 2013
@@ -18,6 +18,7 @@ package org.apache.opennlp.addons.tools.
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.domain.BaseLink;
 import opennlp.tools.entitylinker.domain.LinkedSpan;
 import opennlp.tools.ngram.NGramGenerator;
@@ -30,7 +31,7 @@ import opennlp.tools.util.Span;
 public class FuzzyStringMatchScorer implements LinkedEntityScorer<CountryContext> {
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
     for (LinkedSpan<BaseLink> linkedSpan : linkedSpans) {
       for (BaseLink link : linkedSpan.getLinkedEntries()) {
         Double dice = getDiceCoefficient(linkedSpan.getSearchTerm().toLowerCase().replace(" ", ""), link.getItemName().toLowerCase().replace(" ", ""), 2);

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java?rev=1541014&r1=1541013&r2=1541014&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerSearcher.java Tue Nov 12 11:54:20 2013
@@ -42,7 +42,7 @@ import opennlp.tools.entitylinker.Entity
  */
 public class GazateerSearcher {
 
-  private FuzzyStringMatchScorer diceScorer = new FuzzyStringMatchScorer();
+  //private FuzzyStringMatchScorer diceScorer = new FuzzyStringMatchScorer();
   private double scoreCutoff = .75;
   private Directory geonamesIndex;//= new MMapDirectory(new File(indexloc));
   private IndexReader geonamesReader;// = DirectoryReader.open(geonamesIndex);
@@ -72,7 +72,7 @@ public class GazateerSearcher {
         geonamesAnalyzer = new StandardAnalyzer(Version.LUCENE_45);
       }
 
-      String luceneQueryString = "FULL_NAME_ND_RO:" + searchString + " & CC1:" + code.toUpperCase();// + "~1.0";
+      String luceneQueryString = "FULL_NAME_ND_RO:" + searchString + " AND CC1:" + code.toLowerCase() + "^100";
       QueryParser parser = new QueryParser(Version.LUCENE_45, luceneQueryString, geonamesAnalyzer);
       Query q = parser.parse(luceneQueryString);
 

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java?rev=1541014&r1=1541013&r2=1541014&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java Tue Nov 12 11:54:20 2013
@@ -24,6 +24,7 @@ import opennlp.tools.entitylinker.domain
 import opennlp.tools.util.Span;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.EntityLinker;
+
 /**
  * Links location entities to gazatteers. Currently supports gazateers in a
  * MySql database (NGA and USGS)
@@ -71,7 +72,7 @@ public class GeoEntityLinker implements 
           // geoNamesEntries = geoNamesGaz.find(matches[i], names[i], countryMentions, linkerProperties);
           for (String code : countryMentions.keySet()) {
             if (!code.equals("us")) {
-              geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 5, code, linkerProperties));
+              geoNamesEntries.addAll(gazateerSearcher.geonamesFind(matches[i], 10, code, linkerProperties));
             }
           }
 
@@ -100,9 +101,9 @@ public class GeoEntityLinker implements 
     scorers.add(new FuzzyStringMatchScorer());
     scorers.add(new GeoHashBinningScorer());
     scorers.add(new CountryProximityScorer());
-
+    scorers.add(new ModelBasedScorer());
     for (LinkedEntityScorer scorer : scorers) {
-      scorer.score(spans, doctext, sentences, countryContext);
+      scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
     }
     return spans;
   }

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java?rev=1541014&r1=1541013&r2=1541014&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoHashBinningScorer.java Tue Nov 12 11:54:20 2013
@@ -22,6 +22,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.domain.BaseLink;
 import opennlp.tools.entitylinker.domain.LinkedSpan;
 import opennlp.tools.util.Span;
@@ -34,7 +35,7 @@ import opennlp.tools.util.Span;
 public class GeoHashBinningScorer implements LinkedEntityScorer<CountryContext> {
 
   @Override
-  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, CountryContext additionalContext) {
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans,EntityLinkerProperties properties, CountryContext additionalContext) {
      score( linkedSpans);
   }
 

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java?rev=1541014&r1=1541013&r2=1541014&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/LinkedEntityScorer.java Tue Nov 12 11:54:20 2013
@@ -16,6 +16,7 @@
 package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
 
 import java.util.List;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.domain.LinkedSpan;
 import opennlp.tools.util.Span;
 
@@ -33,5 +34,5 @@ public interface LinkedEntityScorer<T> {
  * @param additionalContext any additional data required to perform the scoring operation
  * @return void
  */
-  void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, T additionalContext);
+  void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, T additionalContext);
 }

Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java?rev=1541014&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java Tue Nov 12 11:54:20 2013
@@ -0,0 +1,255 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.entitylinker.domain.BaseLink;
+import opennlp.tools.entitylinker.domain.LinkedSpan;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Scores linked entries with a doccat model to help with toponym resolution.
+ * @author Owner
+ */
+public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
+
+  public static ModelBasedScorer scorer; // shared instance; the static modelCountryContext helper calls getTextChunk through it
+
+  static {
+    scorer = new ModelBasedScorer();
+  }
+  DocumentCategorizerME documentCategorizerME; // categorizer built from doccatModel, created in score
+  DoccatModel doccatModel; // loaded in score from the configured model path on first use
+  public static final int RADIUS = 100; // number of characters taken on each side of a mention by getTextChunk
+
+  @Override
+  public void score(List<LinkedSpan> linkedSpans, String docText, Span[] sentenceSpans, EntityLinkerProperties properties, CountryContext additionalContext) {
+    try {
+      // load the model the first time this instance scores a document
+      if (doccatModel == null) {
+        String modelPath = properties.getProperty("opennlp.geoentitylinker.modelbasedscorer.modelpath", "");
+        if (modelPath.equals("")) {
+          // no model configured: nothing is scored
+          return;
+        }
+        doccatModel = new DoccatModel(new File(modelPath));
+        documentCategorizerME = new DocumentCategorizerME(doccatModel);
+      }
+      Map<Integer, String> chunksBySpan = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
+      for (Integer spanIdx : chunksBySpan.keySet()) {
+        Map<String, Double> categoryScores = this.getScore(chunksBySpan.get(spanIdx));
+        List<BaseLink> links = (List<BaseLink>) linkedSpans.get(spanIdx).getLinkedEntries();
+        for (BaseLink link : links) {
+          // a link whose parent id is not a category of the model gets a score of zero
+          boolean known = categoryScores.containsKey(link.getItemParentID());
+          double modelScore = known ? categoryScores.get(link.getItemParentID()) : 0d;
+          link.getScoreMap().put("countrymodel", modelScore);
+        }
+      }
+    } catch (FileNotFoundException ex) {
+      System.err.println("could not find modelpath using EntityLinkerProperties. Property should be \"opennlp.geoentitylinker.modelbasedscorer.modelpath\"");
+    } catch (IOException ex) {
+      System.err.println(ex);
+    } catch (Exception ex) {
+      Logger.getLogger(ModelBasedScorer.class.getName()).log(Level.SEVERE, null, ex);
+    }
+  }
+
+  /**
+   * Builds, for every linked span that has at least one linked entry, the
+   * chunk of document text that is categorized against the doccat model. The
+   * chunk is centered on the start of the sentence the span was found in and
+   * reaches up to radius characters to either side, see getTextChunk.
+   *
+   * @param linkedSpans   the spans to generate text chunks for; spans without
+   *                      any linked entries are skipped
+   * @param sentenceSpans the sentence spans of the document; the sentence id of
+   *                      each linked span is used as an index into this array
+   * @param docText       the text the chunks are cut from
+   * @param radius        the number of characters to take on each side of the
+   *                      sentence start
+   * @return a map of the index of the linked span to the string of surrounding
+   *         text: Map<indexofspan,surrounding text>
+   */
+  public Map<Integer, String> generateProximalFeatures(List<LinkedSpan> linkedSpans, Span[] sentenceSpans, String docText, int radius) {
+    Map<Integer, String> featureBags = new HashMap<>();
+    /**
+     * a single pass over the linked spans is enough: the text chunk of a span
+     * only depends on the sentence that span was found in
+     */
+    for (int i = 0; i < linkedSpans.size(); i++) {
+      LinkedSpan span = linkedSpans.get(i);
+      if (span.getLinkedEntries().isEmpty()) {
+        //don't care about spans that did not get linked to anything at all; nothing to work with
+        continue;
+      }
+      /**
+       * get the sentence the name span was found in, the beginning of the
+       * sentence will suffice as a centroid for feature generation around the
+       * named entity
+       */
+      int mentionIdx = sentenceSpans[span.getSentenceid()].getStart();
+      /**
+       * associate the span to the string that will be used for categorization
+       * against the model
+       */
+      featureBags.put(i, getTextChunk(mentionIdx, docText, radius));
+    }
+
+    return featureBags;
+  }
+
+  // returns the text around mentionIdx, at most radius characters per side, cut at spaces
+  private String getTextChunk(int mentionIdx, String docText, int radius) {
+    int docSize = docText.length();
+    int windowStart = Math.max(0, mentionIdx - radius);
+    int windowEnd = Math.min(docSize, mentionIdx + radius);
+    if (windowEnd <= windowStart) {
+      // the window lies outside of the document, nothing to extract
+      return "";
+    }
+    String chunk = docText.substring(windowStart, windowEnd);
+    /**
+     * don't want to chop any words in half, so cut at the first and the last
+     * space of the window. An edge that coincides with the start or the end
+     * of the document cannot split a word, so that edge is kept as it is
+     * (the last word of the document used to be dropped in that case)
+     */
+    int left = (windowStart == 0) ? 0 : chunk.indexOf(" ");
+    int right = (windowEnd == docSize) ? chunk.length() : chunk.lastIndexOf(" ");
+    /**
+     * only trim when both cut points are usable, a window without enough
+     * spaces is returned unchanged
+     */
+    if (left >= 0 && left < right) {
+      chunk = chunk.substring(left, right);
+    }
+
+    return chunk;
+  }
+
+  // categorizes the text and returns every category of the model mapped to its score
+  private Map<String, Double> getScore(String text) throws Exception {
+    if (documentCategorizerME == null) {
+      documentCategorizerME = new DocumentCategorizerME(new DoccatModel(new File("")));
+    }
+    double[] outcomes = documentCategorizerME.categorize(text);
+    Map<String, Double> categoryScores = new HashMap<>();
+    for (int c = 0, total = documentCategorizerME.getNumberOfCategories(); c < total; c++) {
+      String label = documentCategorizerME.getCategory(c);
+      categoryScores.put(label, outcomes[documentCategorizerME.getIndex(label)]);
+    }
+    return categoryScores;
+  }
+
+  /**
+   * Writes a doccat training file from the documents and trains a model on it.
+   * @param documents         A list of document texts, for best results try to
+   *                          ensure each country you care about will be
+   *                          represented by the collection
+   * @param annotationOutFile the location where the annotated doccat text file
+   *                          will be stored; new lines are appended to it
+   * @param modelOutFile      the location where the doccat model will be stored
+   * @param properties        the properties where the country context object
+   *                          will find its country data from this property:
+   *                          opennlp.geoentitylinker.countrycontext.filepath
+   * @throws IOException if the annotation file cannot be written or opened
+   */
+  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
+    CountryContext context = new CountryContext();
+    // the file is read back as UTF-8 for training, so it has to be written as UTF-8 too
+    try (java.io.Writer writer = new java.io.OutputStreamWriter(new FileOutputStream(annotationOutFile, true), "UTF-8")) {
+      for (String docText : documents) {
+        // called for its effect on context only; presumably it collects the country
+        // mentions that modelCountryContext reads - TODO confirm against CountryContext
+        context.regexfind(docText, properties);
+        Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
+        for (Map.Entry<String, ArrayList<String>> entry : modelCountryContext.entrySet()) {
+          for (String wordbag : entry.getValue()) {
+            // one training sample per line: the country code followed by its word bag
+            writer.write(entry.getKey() + " " + wordbag + "\n");
+          }
+        }
+      }
+    }
+
+    // opening the annotation file stays outside of the catch below, so a missing file is still reported to the caller
+    try (InputStream dataIn = new FileInputStream(annotationOutFile)) {
+      try {
+        ObjectStream<String> lineStream = new PlainTextByLineStream(dataIn, "UTF-8");
+        ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
+        DoccatModel model = DocumentCategorizerME.train("en", sampleStream);
+        // closing the stream makes sure the model is completely written to disk
+        try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile))) {
+          model.serialize(modelOut);
+        }
+      } catch (IOException e) {
+        // Failed to read or parse training data, training failed
+        e.printStackTrace();
+      }
+    }
+  }
+
+  /**
+   * Collects, for every country code mentioned in the document, the chunks of
+   * text found within the radius of each of its mentions. The mentions are
+   * taken from the country context object.
+   *
+   * @param docText           the text the chunks are cut from
+   * @param additionalContext the country context that holds the country mentions
+   * @param radius            the number of characters to take around a mention
+   * @return a map of country code to the text chunks around its mentions
+   */
+  public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+    Map<String, ArrayList<String>> chunksByCode = new HashMap<>();
+    /**
+     * walk the mapping of every country code to all of its mentions in the
+     * document
+     */
+    for (Map.Entry<String, Set<Integer>> mentions : additionalContext.getCountryMentions().entrySet()) {
+      String code = mentions.getKey();
+      for (int mentionIdx : mentions.getValue()) {
+        // the text is cut by the shared scorer instance
+        String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
+        /**
+         * chunks of the same country code are gathered in one list, which is
+         * created the first time the code is seen
+         */
+        ArrayList<String> chunks = chunksByCode.get(code);
+        if (chunks == null) {
+          chunks = new ArrayList<>();
+          chunksByCode.put(code, chunks);
+        }
+        chunks.add(chunk);
+      }
+    }
+
+    return chunksByCode;
+  }
+}