Posted to commits@opennlp.apache.org by ma...@apache.org on 2013/11/14 01:42:04 UTC

svn commit: r1541770 - /opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/

Author: markg
Date: Thu Nov 14 00:42:03 2013
New Revision: 1541770

URL: http://svn.apache.org/r1541770
Log:
OPENNLP-579
Added a SetupUtils class so users can easily build the Lucene indexes and country doccat models. Also includes a number of other small efficiency improvements.
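
For context, a minimal usage sketch of the new setup utilities follows. It is based only on the method signatures visible in this diff; the GazType constant name, the EntityLinkerProperties(File) constructor, and all file paths are illustrative assumptions, not verified against the full sources.

  import java.io.File;
  import java.util.Arrays;
  import java.util.List;
  import opennlp.tools.entitylinker.EntityLinkerProperties;
  import org.apache.opennlp.addons.tools.entitylinker.geoentitylinker.GazateerIndexer;
  import org.apache.opennlp.addons.tools.entitylinker.geoentitylinker.GeoEntityLinkerSetupUtils;

  public class GeoEntityLinkerSetupExample {
    public static void main(String[] args) throws Exception {
      // Build a Lucene gazetteer index from raw gazetteer data.
      // GazType.GEONAMES is a placeholder; use whichever constant the
      // GazateerIndexer.GazType enum actually defines.
      GeoEntityLinkerSetupUtils.createLuceneIndex(
          new File("/data/indexes/geonames"),          // output index directory (illustrative path)
          new File("/data/gazetteers/geonames.txt"),   // raw gazetteer input file (illustrative path)
          GazateerIndexer.GazType.GEONAMES);

      // Train the country-context doccat model from a small document collection.
      // The properties file must supply opennlp.geoentitylinker.countrycontext.filepath.
      List<String> docs = Arrays.asList(
          "A news article mentioning France and Paris ...",
          "A report about rivers in Brazil ...");
      EntityLinkerProperties props =
          new EntityLinkerProperties(new File("entitylinker.properties"));
      GeoEntityLinkerSetupUtils.buildCountryContextModel(
          docs,
          new File("/data/models/countrycontext-annotations.txt"), // annotated doccat training text
          new File("/data/models/countrycontext.doccat"),          // serialized doccat model
          props);
    }
  }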

Added:
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
Modified:
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
    opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java?rev=1541770&r1=1541769&r2=1541770&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/FuzzyStringMatchScorer.java Thu Nov 14 00:42:03 2013
@@ -41,7 +41,7 @@ public class FuzzyStringMatchScorer impl
       }
     }
 
-  
+
   }
 
   /**

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java?rev=1541770&r1=1541769&r2=1541770&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java Thu Nov 14 00:42:03 2013
@@ -1,6 +1,17 @@
 /*
- * To change this template, choose Tools | Templates
- * and open the template in the editor.
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
 
@@ -23,7 +34,7 @@ import org.apache.lucene.util.Version;
 
 /**
  *
- * @author Owner
 + * Creates two Lucene indexes, GeoNames and USGS, for use in the GeoEntityLinker
  */
 public class GazateerIndexer {
 

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java?rev=1541770&r1=1541769&r2=1541770&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinker.java Thu Nov 14 00:42:03 2013
@@ -33,13 +33,11 @@ import opennlp.tools.entitylinker.Entity
  */
 public class GeoEntityLinker implements EntityLinker<LinkedSpan> {
 
-  // CountryProximityScorer scorer = new CountryProximityScorer();
-//  private MySQLGeoNamesGazLinkable geoNamesGaz;// = new MySQLGeoNamesGazLinkable();
-//  private MySQLUSGSGazLinkable usgsGaz;//= new MySQLUSGSGazLinkable();
   private CountryContext countryContext;
   private Map<String, Set<Integer>> countryMentions;
   private EntityLinkerProperties linkerProperties;
   private GazateerSearcher gazateerSearcher = new GazateerSearcher();
+  private List<LinkedEntityScorer> scorers = new ArrayList<>();
   /**
    * Flag for deciding whether to search gaz only for toponyms within countries
    * that are mentioned in the document
@@ -97,11 +95,12 @@ public class GeoEntityLinker implements 
       }
     }
 
-    List<LinkedEntityScorer<CountryContext>> scorers = new ArrayList<>();
-    scorers.add(new FuzzyStringMatchScorer());
-    scorers.add(new GeoHashBinningScorer());
-    scorers.add(new CountryProximityScorer());
-    scorers.add(new ModelBasedScorer());
+    if (scorers.isEmpty()) {
+      scorers.add(new FuzzyStringMatchScorer());
+      scorers.add(new GeoHashBinningScorer());
+      scorers.add(new CountryProximityScorer());
+      scorers.add(new ModelBasedScorer());
+    }
     for (LinkedEntityScorer scorer : scorers) {
       scorer.score(spans, doctext, sentences, linkerProperties, countryContext);
     }

Added: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java?rev=1541770&view=auto
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java (added)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GeoEntityLinkerSetupUtils.java Thu Nov 14 00:42:03 2013
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2013 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import opennlp.tools.doccat.DoccatModel;
+import opennlp.tools.doccat.DocumentCategorizerME;
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.entitylinker.EntityLinkerProperties;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import static org.apache.opennlp.addons.tools.entitylinker.geoentitylinker.ModelBasedScorer.RADIUS;
+
+
+/**
+ *
 + * Tools for setting up GeoEntityLinker gazetteers and the doccat scoring model
+ */
+public class GeoEntityLinkerSetupUtils {
+  public static ModelBasedScorer scorer;
+
+  static {
+    scorer = new ModelBasedScorer();
+  }
+    public static void createLuceneIndex(File outputIndexDir, File gazateerInputData, GazateerIndexer.GazType type){
+      GazateerIndexer indexer = new GazateerIndexer();
+      try {
+        indexer.index(outputIndexDir, gazateerInputData, type);
+      } catch (Exception ex) {
+       ex.printStackTrace();
+      }
+    }
+    /**
+   *
 +   * @param documents         A list of document texts; for best results, try
 +   *                          to ensure each country you care about is
 +   *                          represented in the collection
+   * @param annotationOutFile the location where the annotated doccat text file
+   *                          will be stored
+   * @param modelOutFile      the location where the doccat model will be stored
+   * @param properties        the properties where the country context object
 +   *                          will find its country data from this property:
+   *                          opennlp.geoentitylinker.countrycontext.filepath
+   * @throws IOException
+   */
+  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
+    CountryContext context = new CountryContext();
+    FileWriter writer = new FileWriter(annotationOutFile, true);
+    System.out.println("processing " + documents.size() + " documents");
+    for (String docText : documents) {
+      System.out.append(".");
+      Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
+      Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
+      for (String key : modelCountryContext.keySet()) {
+        for (String wordbag : modelCountryContext.get(key)) {
+          writer.write(key + " " + wordbag + "\n");
+        }
+      }
+    }
 +    System.out.println("Document processing complete. Writing training data to file");
+    writer.close();
+    System.out.println("Building Doccat model...");
+    DoccatModel model = null;
+
+    InputStream dataIn = new FileInputStream(annotationOutFile);
+    try {
+
+      ObjectStream<String> lineStream =
+              new PlainTextByLineStream(dataIn, "UTF-8");
+      ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
+
+      model = DocumentCategorizerME.train("en", sampleStream);
+      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
+      model.serialize(modelOut);
+       System.out.println("Model complete!");
+    } catch (IOException e) {
+      // Failed to read or parse training data, training failed
+      e.printStackTrace();
+    }
+
+  }
+
+  /**
+   * generates proximal wordbags within the radius of a country mention within
+   * the doctext based on the country context object
+   *
+   *
+   * @param docText
+   * @param additionalContext
+   * @param radius
+   * @return
+   */
+  public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
+    Map<String, ArrayList< String>> featureBags = new HashMap<>();
+    Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
+    /**
+     * iterator over the map that contains a mapping of every country code to
+     * all of its mentions in the document
+     */
+    for (String code : countryMentions.keySet()) {
+      /**
+       * for each mention, collect features from around each mention, then
+       * consolidate the features into another map
+       */
+      for (int mentionIdx : countryMentions.get(code)) {
+        String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
+        //   Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
+        if (featureBags.containsKey(code)) {
+          featureBags.get(code).add(chunk);
+        } else {
+          ArrayList<String> newlist = new ArrayList<>();
+          newlist.add(chunk);
+          featureBags.put(code, newlist);
+        }
+      }
+    }
+    return featureBags;
+  }
+
+}

Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java?rev=1541770&r1=1541769&r2=1541770&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/ModelBasedScorer.java Thu Nov 14 00:42:03 2013
@@ -15,46 +15,26 @@
  */
 package org.apache.opennlp.addons.tools.entitylinker.geoentitylinker;
 
-import java.io.BufferedOutputStream;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.FileWriter;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-import opennlp.tools.doccat.BagOfWordsFeatureGenerator;
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
-import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.doccat.DocumentSampleStream;
 import opennlp.tools.entitylinker.EntityLinkerProperties;
 import opennlp.tools.entitylinker.domain.BaseLink;
 import opennlp.tools.entitylinker.domain.LinkedSpan;
-import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
 
 /**
  *
- *Utilizes a doccat model to score toponyms based on surrounding context
+ * Utilizes a doccat model to score toponyms based on surrounding context
  */
 public class ModelBasedScorer implements LinkedEntityScorer<CountryContext> {
 
-  public static ModelBasedScorer scorer;
 
-  static {
-    scorer = new ModelBasedScorer();
-  }
   DocumentCategorizerME documentCategorizerME;
   DoccatModel doccatModel;
   public static final int RADIUS = 100;
@@ -68,7 +48,6 @@ public class ModelBasedScorer implements
           return;
         }
         doccatModel = new DoccatModel(new File(path));
-
         documentCategorizerME = new DocumentCategorizerME(doccatModel);
       }
       Map<Integer, String> proximalFeatures = generateProximalFeatures(linkedSpans, sentenceSpans, docText, RADIUS);
@@ -88,7 +67,7 @@ public class ModelBasedScorer implements
     } catch (IOException ex) {
       System.err.println(ex);
     } catch (Exception ex) {
-      Logger.getLogger(ModelBasedScorer.class.getName()).log(Level.SEVERE, null, ex);
+      System.err.println(ex);
     }
   }
 
@@ -136,7 +115,7 @@ public class ModelBasedScorer implements
     return featureBags;
   }
 
-  private String getTextChunk(int mentionIdx, String docText, int radius) {
+  public String getTextChunk(int mentionIdx, String docText, int radius) {
     int docSize = docText.length();
     int left = 0, right = 0;
     left = (mentionIdx - radius < 0) ? 0 : mentionIdx - radius;
@@ -167,9 +146,6 @@ public class ModelBasedScorer implements
 
   private Map<String, Double> getScore(String text) throws Exception {
     Map<String, Double> scoreMap = new HashMap<>();
-    if (documentCategorizerME == null) {
-      documentCategorizerME = new DocumentCategorizerME(new DoccatModel(new File("")));
-    }
     double[] categorize = documentCategorizerME.categorize(text);
     int catSize = documentCategorizerME.getNumberOfCategories();
     for (int i = 0; i < catSize; i++) {
@@ -179,88 +155,5 @@ public class ModelBasedScorer implements
     return scoreMap;
   }
 
-  /**
-   *
-   * @param documents         A list of document texts, for best results try to
-   *                          ensure each country you care about will be
-   *                          represented by the collection
-   * @param annotationOutFile the location where the annotated doccat text file
-   *                          will be stored
-   * @param modelOutFile      the location where the doccat model will be stored
-   * @param properties        the properties where the country context object
-   *                          will find it's country data from this property:
-   *                          opennlp.geoentitylinker.countrycontext.filepath
-   * @throws IOException
-   */
-  public static void buildCountryContextModel(Collection<String> documents, File annotationOutFile, File modelOutFile, EntityLinkerProperties properties) throws IOException {
-    CountryContext context = new CountryContext();
-    FileWriter writer = new FileWriter(annotationOutFile, true);
-    for (String docText : documents) {
-
-      Map<String, Set<Integer>> regexfind = context.regexfind(docText, properties);
-      Map<String, ArrayList<String>> modelCountryContext = modelCountryContext(docText, context, RADIUS);
-      for (String key : modelCountryContext.keySet()) {
-        for (String wordbag : modelCountryContext.get(key)) {
-          writer.write(key + " " + wordbag + "\n");
-        }
-      }
-    }
-
-    writer.close();
-
-    DoccatModel model = null;
-
-    InputStream dataIn = new FileInputStream(annotationOutFile);
-    try {
-
-      ObjectStream<String> lineStream =
-              new PlainTextByLineStream(dataIn, "UTF-8");
-      ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);
-
-      model = DocumentCategorizerME.train("en", sampleStream);
-      OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelOutFile));
-      model.serialize(modelOut);
-    } catch (IOException e) {
-      // Failed to read or parse training data, training failed
-      e.printStackTrace();
-    }
-
-  }
-
-  /**
-   * generates proximal wordbags within the radius of a country mention within
-   * the doctext based on the country context object
-   *
-   *
-   * @param docText
-   * @param additionalContext
-   * @param radius
-   * @return
-   */
-  public static Map<String, ArrayList<String>> modelCountryContext(String docText, CountryContext additionalContext, int radius) {
-    Map<String, ArrayList< String>> featureBags = new HashMap<>();
-    Map<String, Set<Integer>> countryMentions = additionalContext.getCountryMentions();
-    /**
-     * iterator over the map that contains a mapping of every country code to
-     * all of its mentions in the document
-     */
-    for (String code : countryMentions.keySet()) {
-      /**
-       * for each mention, collect features from around each mention, then
-       * consolidate the features into another map
-       */
-      for (int mentionIdx : countryMentions.get(code)) {
-        String chunk = scorer.getTextChunk(mentionIdx, docText, radius);
-        //   Collection<String> extractFeatures = super.extractFeatures(chunk.split(" "));
-        if (featureBags.containsKey(code)) {
-          featureBags.get(code).add(chunk);
-        } else {
-          ArrayList<String> newlist = new ArrayList<>();
-          newlist.add(chunk);
-          featureBags.put(code, newlist);
-        }
-      }
-    }
-    return featureBags;
-  }
+  
 }