You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/07/20 01:08:04 UTC

svn commit: r1148582 - in /incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder: EntityContentProvider.java NameFinderJob.java

Author: joern
Date: Tue Jul 19 23:08:03 2011
New Revision: 1148582

URL: http://svn.apache.org/viewvc?rev=1148582&view=rev
Log:
OPENNLP-235 Name finder is now restricted to existing annotations.

Modified:
    incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
    incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java

Modified: incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java?rev=1148582&r1=1148581&r2=1148582&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java (original)
+++ incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java Tue Jul 19 23:08:03 2011
@@ -143,6 +143,7 @@ public class EntityContentProvider imple
       nameFinder.setText(text);
       nameFinder.setSentences(sentences.toArray(new Span[sentences.size()]));
       nameFinder.setTokens(tokens.toArray(new Span[tokens.size()]));
+      nameFinder.setVerifiedNames(nameSpans.toArray(new Span[nameSpans.size()]));
       
       nameFinder.schedule();
     }

Modified: incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java?rev=1148582&r1=1148581&r2=1148582&view=diff
==============================================================================
--- incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java (original)
+++ incubator/opennlp/sandbox/opennlp-caseditor-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java Tue Jul 19 23:08:03 2011
@@ -20,9 +20,12 @@ package org.apache.opennlp.caseditor.nam
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.NameFinderSequenceValidator;
 import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.util.Span;
 
@@ -37,12 +40,38 @@ import org.eclipse.core.runtime.jobs.Job
 // don't change setting, while job is running!
 public class NameFinderJob extends Job {
   
+  // TODO: It should be changed in a way that detected annotations are always
+  // a perfect match with existing annotations
+  static class RestrictedSequencesValidator extends NameFinderSequenceValidator {
+    
+    private Set<Integer> nameIndex = new HashSet<Integer>();
+    
+    // also give it a no-name index
+    void setRestriction(Set<Integer> nameIndex) {
+      this.nameIndex = nameIndex;
+    }
+    
+    @Override
+    public boolean validSequence(int i, String[] inputSequence,
+        String[] outcomesSequence, String outcome) {
+      boolean valid = super.validSequence(i, inputSequence, outcomesSequence, outcome);
+      
+      if (valid && nameIndex.contains(i)) {
+        return outcome.endsWith(NameFinderME.START) || outcome.endsWith(NameFinderME.CONTINUE);
+      }
+      
+      return valid;
+    }
+  }
+  
   private NameFinderME nameFinder;
+  private RestrictedSequencesValidator sequenceValidator;
   
   private String modelPath;
   private String text;
   private Span sentences[];
   private Span tokens[];
+  private Span verifiedNames[] = new Span[0];
   
   private List<Entity> nameList;
   
@@ -65,6 +94,10 @@ public class NameFinderJob extends Job {
   synchronized void setTokens(Span tokens[]) {
     this.tokens = tokens;
   }
+  
+  synchronized void setVerifiedNames(Span verifiedNames[]) {
+    this.verifiedNames = verifiedNames;
+  }
 
   // maybe report result, through an interface?!
   @Override
@@ -77,7 +110,8 @@ public class NameFinderJob extends Job {
         modelIn = NameFinderViewPage.class
             .getResourceAsStream("/en-ner-per.bin");
         TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
-        nameFinder = new NameFinderME(model, null, 5);
+        sequenceValidator = new RestrictedSequencesValidator();
+        nameFinder = new NameFinderME(model, null, 5, sequenceValidator);
       } catch (IOException e) {
         e.printStackTrace();
       } finally {
@@ -113,6 +147,19 @@ public class NameFinderJob extends Job {
           tokenStrings[i] = token.getCoveredText(text).toString();
         }
         
+        Set<Integer> verifiedNameTokens = new HashSet<Integer>();
+        
+        // iterate over names, to find token indexes
+        for (Span verifiedName : verifiedNames) {
+          for (int i = 0; i < sentenceTokens.size(); i++) {
+            if (verifiedName.contains(sentenceTokens.get(i))) {
+              verifiedNameTokens.add(i);
+            }
+          }
+        }
+        
+        sequenceValidator.setRestriction(verifiedNameTokens);
+        
         Span names[] = nameFinder.find(tokenStrings);
         double nameProbs[] = nameFinder.probs(names);