You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/13 13:13:34 UTC

svn commit: r1182782 - in /incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor: namefinder/EntityContentProvider.java util/UIMAUtil.java

Author: joern
Date: Thu Oct 13 11:13:34 2011
New Revision: 1182782

URL: http://svn.apache.org/viewvc?rev=1182782&view=rev
Log:
OPENNLP-303 Now uses token annotations from input CAS instead of simple tokenizer.

Added:
    incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java   (with props)
Modified:
    incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java

Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java?rev=1182782&r1=1182781&r2=1182782&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java Thu Oct 13 11:13:34 2011
@@ -27,6 +27,8 @@ import opennlp.tools.tokenize.SimpleToke
 import opennlp.tools.util.Span;
 
 import org.apache.opennlp.caseditor.OpenNLPPreferenceConstants;
+import org.apache.opennlp.caseditor.util.ContainingConstraint;
+import org.apache.opennlp.caseditor.util.UIMAUtil;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.FSIndex;
 import org.apache.uima.cas.FeatureStructure;
@@ -260,6 +262,9 @@ public class EntityContentProvider imple
           
           @Override
           public void run() {
+            
+            // TODO: Check if the view is still available; this might be called after the view is disposed.
+            
             IStatus status = event.getResult();
             
             if (status.isOK()) {
@@ -378,18 +383,15 @@ public class EntityContentProvider imple
   }
   
   void runNameFinder() {
+    
     IPreferenceStore store = editor.getCasDocumentProvider().getTypeSystemPreferenceStore(editor.getEditorInput());
     String sentenceTypeName = store.getString(OpenNLPPreferenceConstants.SENTENCE_TYPE);
     
-    // TODO: Add check for sentence type name
     if (sentenceTypeName.isEmpty()) {
       nameFinderView.setMessage("Sentence type is not set!");
       return;
     }
     
-
-    // TODO: Add check for additional sentence type names
-    
     String modelPathes[] = store.getString(OpenNLPPreferenceConstants.NAME_FINDER_MODEL_PATH).split(",");
     
     for (int i = 0; i < modelPathes.length; i++) {
@@ -409,51 +411,60 @@ public class EntityContentProvider imple
 
     if (text != null) {
 
+      Type sentenceTypes[] = UIMAUtil.splitTypes(
+          sentenceTypeName + "," +  additionalSentenceTypes, ',', cas.getTypeSystem());
+      
+      if (sentenceTypes == null) {
+        nameFinderView.setMessage("Sentence type does not exist in type system!");
+        return;
+      }
+      
+      String tokenName = store.getString(OpenNLPPreferenceConstants.TOKEN_TYPE);
+      
+      if (tokenName.isEmpty()) {
+        nameFinderView.setMessage("Token type name is not set!");
+        return;
+      }
+      
+      Type tokenType = cas.getTypeSystem().getType(tokenName);
+      
+      if (tokenType == null) {
+        nameFinderView.setMessage("Token type does not exist in type system!");
+        return;
+      }
+      
       List<Span> sentences = new ArrayList<Span>();
-
-      String sentenceTypeNames[] = (sentenceTypeName + "," +  additionalSentenceTypes).split(",");
+      List<Span> tokens = new ArrayList<Span>();
       
-      for (String typeName : sentenceTypeNames) {
-        Type sentenceType = cas.getTypeSystem().getType(typeName.trim()); 
+      for (Iterator<AnnotationFS> sentenceIterator = 
+          UIMAUtil.createMultiTypeIterator(cas, sentenceTypes);
+          sentenceIterator.hasNext();) {
         
-        if (sentenceType == null) {
-          nameFinderView.setMessage("Sentence type does not exist in type system!");
-          return;
-        }
+        AnnotationFS sentenceAnnotation = (AnnotationFS) sentenceIterator
+            .next();
         
-        FSIndex<AnnotationFS> sentenceAnnotations = cas
-            .getAnnotationIndex(sentenceType);
+        // TODO: Add code to detect overlapping sentences ... not allowed!
         
-        for (Iterator<AnnotationFS> sentenceIterator = sentenceAnnotations
-            .iterator(); sentenceIterator.hasNext();) {
-          
-          AnnotationFS sentenceAnnotation = (AnnotationFS) sentenceIterator
-              .next();
+        sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));
+        
+        // Performance Note: 
+        // The following code has O(n^2) complexity; it can be optimized
+        // by using a token iterator over all tokens and manual weaving.
+        
+        FSIndex<AnnotationFS> allTokens = cas.getAnnotationIndex(tokenType);
+        
+        ContainingConstraint containingConstraint = 
+            new ContainingConstraint(sentenceAnnotation);
+        
+        Iterator<AnnotationFS> containingTokens = cas.createFilteredIterator(
+            allTokens.iterator(), containingConstraint);
+        
+        while (containingTokens.hasNext()) {
+          AnnotationFS token = (AnnotationFS) containingTokens.next();
           
-          sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));
+          tokens.add(new Span(token.getBegin(), token.getEnd()));
         }
       }
-
-      // sort sentences list ... ascending
-      Collections.sort(sentences);
-      
-      // iterate again and create tokens ...
-      List<Span> tokens = new ArrayList<Span>();
-      
-      for (Span sentence : sentences) {
-          String sentText = sentence.getCoveredText(text).toString();
-          
-          // TODO: Extract tokens here! Instead of using the simple tokenizer!
-          
-          Span tokenSpans[] = SimpleTokenizer.INSTANCE.tokenizePos(sentText);
-
-          int sentenceOffset = sentence.getStart();
-
-          for (Span token : tokenSpans) {
-            tokens.add(new Span(sentenceOffset + token.getStart(),
-                sentenceOffset + token.getEnd()));
-          }
-      }
       
       List<Span> nameSpans = new ArrayList<Span>();
 
@@ -479,9 +490,22 @@ public class EntityContentProvider imple
         }
       }
       
-      // This will cause issues when it is done while it is running!
+      // Bug: Changing the data of the name finder will cause an issue if it is already running!
+      
       nameFinder.setText(text);
+      
+      if (sentences.size() == 0) {
+        nameFinderView.setMessage("CAS must at least contain one sentence!");
+        return;
+      }
+      
       nameFinder.setSentences(sentences.toArray(new Span[sentences.size()]));
+      
+      if (tokens.size() == 0) {
+        nameFinderView.setMessage("CAS must at least contain one token within a sentence!");
+        return;
+      }
+      
       nameFinder.setTokens(tokens.toArray(new Span[tokens.size()]));
       nameFinder.setVerifiedNames(nameSpans.toArray(new Span[nameSpans.size()]));
       nameFinder.setModelPath(modelPathes, nameTypeNames);

Added: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java?rev=1182782&view=auto
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java (added)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java Thu Oct 13 11:13:34 2011
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.caseditor.util;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.ConstraintFactory;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.FSTypeConstraint;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+
+public class UIMAUtil {
+
+  public static String[] split(String parameter, char splitChar) {
+    
+    String parts[] = parameter.split(Character.toString(splitChar));
+    
+    for (int i = 0; i < parts.length; i++) {
+      parts[i] = parts[i].trim();
+    }
+    
+    return parts;
+  }
+  
+  // TODO: Should throw an exception
+  public static Type[] splitTypes(String typeList, char splitChar, TypeSystem typeSystem) {
+    String typeNames[] = split(typeList, splitChar);
+    
+    Type types[] = new Type[typeNames.length];
+    
+    for (int i = 0; i < typeNames.length; i++) {
+      types[i] = typeSystem.getType(typeNames[i]);
+      
+      if (types[i] == null) {
+        return null; // TODO: Throw an exception instead!
+      }
+    }
+    
+    return types;
+  }
+  
+  public static FSIterator<AnnotationFS> createMultiTypeIterator(CAS cas, Type... types) {
+    
+    if (types.length == 0)
+      throw new IllegalArgumentException("Need at least one type to create an iterator!");
+    
+    ConstraintFactory cf = ConstraintFactory.instance();
+    FSIterator<AnnotationFS> iterator = cas.getAnnotationIndex().iterator();
+
+    FSTypeConstraint typeConstraint = cf.createTypeConstraint();
+    
+    for (Type type : types) {
+      typeConstraint.add(type);
+    }
+
+    // Create and use the filtered iterator
+    FSIterator<AnnotationFS> filteredIterator = cas.createFilteredIterator(iterator, typeConstraint);
+    
+    return filteredIterator;
+  }
+}

Propchange: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain