You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/09/08 15:58:00 UTC

svn commit: r1166692 - in /incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor: OpenNLPPreferenceConstants.java namefinder/EntityContentProvider.java namefinder/NameFinderPreferencePage.java

Author: joern
Date: Thu Sep  8 13:57:59 2011
New Revision: 1166692

URL: http://svn.apache.org/viewvc?rev=1166692&view=rev
Log:
OPENNLP-235 Name Finder now supports multiple sentence types.

Modified:
    incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java
    incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
    incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java

Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java?rev=1166692&r1=1166691&r2=1166692&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java Thu Sep  8 13:57:59 2011
@@ -24,6 +24,7 @@ public class OpenNLPPreferenceConstants 
   
   public static final String PARAGRAPH_TYPE = "PARAGRAPH_TYPE";
   public static final String SENTENCE_TYPE = "SENTENCE_TYPE";
+  public static final String ADDITIONAL_SENTENCE_TYPE = "ADDITIONAL_SENTENCE_TYPE";
   public static final String TOKEN_TYPE = "TOKEN_TYPE";
   public static final String NAME_TYPE = "NAME_TYPE";
   public static final String TOKENIZER_MODEL_PATH = "TOKENIZER_MODEL_PATH";

Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java?rev=1166692&r1=1166691&r2=1166692&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java Thu Sep  8 13:57:59 2011
@@ -19,6 +19,7 @@ package org.apache.opennlp.caseditor.nam
 
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
 
@@ -325,6 +326,7 @@ public class EntityContentProvider imple
   void runNameFinder() {
     IPreferenceStore store = OpenNLPPlugin.getDefault().getPreferenceStore();
     String sentenceTypeName = store.getString(OpenNLPPreferenceConstants.SENTENCE_TYPE);
+    String additionalSentenceTypes = store.getString(OpenNLPPreferenceConstants.ADDITIONAL_SENTENCE_TYPE);
     String nameTypeName = store.getString(OpenNLPPreferenceConstants.NAME_TYPE);
     String modelPath = store.getString(OpenNLPPreferenceConstants.NAME_FINDER_MODEL_PATH);
     
@@ -332,43 +334,56 @@ public class EntityContentProvider imple
     
     // just get it from preference store?!
     // Should have a good way to display an error when the type is incorrect ...
-    Type sentenceType = cas.getTypeSystem().getType(sentenceTypeName); 
+    
     
     String text = cas.getDocumentText();
 
     if (text != null) {
 
-      // TODO: get list of token annotations
-
       List<Span> sentences = new ArrayList<Span>();
-      List<Span> tokens = new ArrayList<Span>();
-      // get a list on name annotations, they will force the
-      // name finder to detect them ... and maybe maintain a negative list
-
-      FSIndex<AnnotationFS> sentenceAnnotations = cas
-          .getAnnotationIndex(sentenceType);
-
-      for (Iterator<AnnotationFS> sentenceIterator = sentenceAnnotations
-          .iterator(); sentenceIterator.hasNext();) {
-
-        AnnotationFS sentenceAnnotation = (AnnotationFS) sentenceIterator
-            .next();
 
-        sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));
+      String sentenceTypeNames[] = (sentenceTypeName + "," +  additionalSentenceTypes) .split(",");
+      
+      for (String typeName : sentenceTypeNames) {
+        Type sentenceType = cas.getTypeSystem().getType(typeName.trim()); 
+      
+        // TODO: If type cannot be mapped, it throws a null pointer exception ...
+        
+        FSIndex<AnnotationFS> sentenceAnnotations = cas
+            .getAnnotationIndex(sentenceType);
+        
+        for (Iterator<AnnotationFS> sentenceIterator = sentenceAnnotations
+            .iterator(); sentenceIterator.hasNext();) {
+          
+          AnnotationFS sentenceAnnotation = (AnnotationFS) sentenceIterator
+              .next();
+          
+          sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));
 
-        String sentText = sentenceAnnotation.getCoveredText();
+        }
+      }
 
-        Span tokenSpans[] = SimpleTokenizer.INSTANCE.tokenizePos(sentText);
+      // sort sentences list ... ascending
+      Collections.sort(sentences);
+      
+      // iterate again and create tokens ...
+      List<Span> tokens = new ArrayList<Span>();
+      
+      for (Span sentence : sentences) {
+          String sentText = sentence.getCoveredText(text).toString();
+          
+          // TODO: Extract tokens here! Instead of using the simple tokenizer!
+          
+          Span tokenSpans[] = SimpleTokenizer.INSTANCE.tokenizePos(sentText);
 
-        int sentenceOffset = sentenceAnnotation.getBegin();
+          int sentenceOffset = sentence.getStart();
 
-        for (Span token : tokenSpans) {
-          tokens.add(new Span(sentenceOffset + token.getStart(),
-              sentenceOffset + token.getEnd()));
-        }
+          for (Span token : tokenSpans) {
+            tokens.add(new Span(sentenceOffset + token.getStart(),
+                sentenceOffset + token.getEnd()));
+          }
       }
-
-      // Note: When an annotation is removed, it might still be in the cas ...
+      
       
       List<Span> nameSpans = new ArrayList<Span>();
 

Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java?rev=1166692&r1=1166691&r2=1166692&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java Thu Sep  8 13:57:59 2011
@@ -43,6 +43,11 @@ public class NameFinderPreferencePage ex
             "Model Path", getFieldEditorParent());
     addField(modelPath);
     
+    StringFieldEditor additionalSentenceTypes = new StringFieldEditor(
+        OpenNLPPreferenceConstants.ADDITIONAL_SENTENCE_TYPE,
+        "Additional Sentence Types", getFieldEditorParent());
+    addField(additionalSentenceTypes);
+    
     StringFieldEditor nameType = new StringFieldEditor(
         OpenNLPPreferenceConstants.NAME_TYPE,
         "Name Type", getFieldEditorParent());