You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2016/12/06 22:02:28 UTC

svn commit: r1772966 - in /ctakes/trunk: ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/ ...

Author: tmill
Date: Tue Dec  6 22:02:28 2016
New Revision: 1772966

URL: http://svn.apache.org/viewvc?rev=1772966&view=rev
Log:
Coreference annotator creates Events at end of coref process.

Removed:
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/CoreferencePipelineFactory.java
Modified:
    ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/factory/CoreferenceAnnotatorFactory.java
    ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java

Modified: ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java?rev=1772966&r1=1772965&r2=1772966&view=diff
==============================================================================
--- ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java (original)
+++ ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java Tue Dec  6 22:02:28 2016
@@ -18,7 +18,21 @@
  */
 package org.apache.ctakes.clinicalpipeline;
 
-import org.apache.ctakes.assertion.medfacts.cleartk.*;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.assertion.medfacts.cleartk.ConditionalCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.GenericCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.HistoryCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.SubjectCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
 import org.apache.ctakes.chunker.ae.Chunker;
 import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
 import org.apache.ctakes.constituency.parser.ae.ConstituencyParser;
@@ -26,6 +40,7 @@ import org.apache.ctakes.contexttokenize
 import org.apache.ctakes.core.ae.SentenceDetector;
 import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
 import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.coreference.factory.CoreferenceAnnotatorFactory;
 import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
 import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
 import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
@@ -51,11 +66,6 @@ import org.apache.uima.jcas.cas.FSArray;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.xml.sax.SAXException;
 
-import java.io.FileWriter;
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.*;
-
 final public class ClinicalPipelineFactory {
 
    private ClinicalPipelineFactory() {
@@ -128,6 +138,16 @@ final public class ClinicalPipelineFacto
       builder.add( ChunkAdjuster.createAnnotatorDescription( new String[] { "NP", "PP", "NP" }, 2 ) );
       return builder.createAggregateDescription();
    }
+   
+   /** Aggregates getFastPipeline(), the constituency parser and the default coreference pipeline, in that order. */
+   public static AnalysisEngineDescription getCoreferencePipeline() throws ResourceInitializationException, MalformedURLException {
+     final AggregateBuilder corefAggregate = new AggregateBuilder();
+     corefAggregate.add(getFastPipeline());
+     corefAggregate.add(ConstituencyParser.createAnnotatorDescription());
+     // The default coreference annotators are registered last, after the parser.
+     corefAggregate.add(CoreferenceAnnotatorFactory.getDefaultCoreferencePipeline());
+     return corefAggregate.createAggregateDescription();
+   }
 
    public static void main( final String... args ) throws IOException, UIMAException, SAXException {
       // The note is easier to read when sentences are stacked - changed 3/16/2015 spf

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java?rev=1772966&r1=1772965&r2=1772966&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java Tue Dec  6 22:02:28 2016
@@ -45,14 +45,14 @@ public class MarkableHeadTreeCreator ext
     try{
       docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
     }catch(Exception e){
-      e.printStackTrace();
+      //System.err.println(e.getMessage());
     }
     if(docId == null || docId == DocumentIDAnnotationUtil.NO_DOCUMENT_ID){
       try {
         docId = ViewUriUtil.getURI(jcas).toString();
-      } catch (AnalysisEngineProcessException e) {
-        e.printStackTrace();
-        logger.warn("No document ID found using traditional methods. Using ad hoc combination");
+      } catch (Exception e) {
+        //System.err.println(e.getMessage());
+        //logger.warn("No document ID found using traditional methods. Using ad hoc combination");
         String docText = jcas.getDocumentText();
         docId = docText.substring(0, Math.min(20, docText.length())) + "_hash=" + docText.hashCode(); 
       }

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1772966&r1=1772965&r2=1772966&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Tue Dec  6 22:02:28 2016
@@ -1,13 +1,18 @@
 package org.apache.ctakes.coreference.ae;
 
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 
 import org.apache.ctakes.core.util.ListFactory;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
@@ -24,14 +29,30 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.ae.pairing.cluster.HeadwordPairer;
 import org.apache.ctakes.coreference.ae.pairing.cluster.SectionHeaderPairer;
 import org.apache.ctakes.coreference.ae.pairing.cluster.SentenceDistancePairer;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
+import org.apache.ctakes.typesystem.type.refsem.AnatomicalSite;
+import org.apache.ctakes.typesystem.type.refsem.DiseaseDisorder;
+import org.apache.ctakes.typesystem.type.refsem.Element;
+import org.apache.ctakes.typesystem.type.refsem.Event;
+import org.apache.ctakes.typesystem.type.refsem.Medication;
+import org.apache.ctakes.typesystem.type.refsem.Procedure;
+import org.apache.ctakes.typesystem.type.refsem.SignSymptom;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
 import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationMention;
+import org.apache.ctakes.typesystem.type.textsem.ProcedureMention;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
 import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.ctakes.utils.struct.MapFactory;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
@@ -40,6 +61,7 @@ import org.apache.uima.fit.factory.Analy
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.FSArray;
 import org.apache.uima.jcas.cas.NonEmptyFSList;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.ml.CleartkAnnotator;
@@ -250,39 +272,6 @@ public class MentionClusterCoreferenceAn
               feature.setValue("NULL");
               String message = String.format("Null value found in %s from %s", feature, features);
               System.err.println(message);
-              //            throw new IllegalArgumentException(String.format(message, feature, features));
-            }else{
-//              String prefix = null;
-              //  Durret and Klein style feature conjunctions: pronoun type or pos tag. maybe try umls semantic-type?
-              /*
-              if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
-                prefix = "PRO_"+mentionText;
-              }else if(headNode != null && headNode.getPostag() != null){
-                prefix = headNode.getPostag();                
-              }else{
-                prefix = "UNK";
-              }
-              */
-              // headword-based feature conjunctions
-/*              if(headNode != null && headNode.getCoveredText() != null && headMatches(headNode.getCoveredText().toLowerCase(), features)){
-                prefix = "HEAD_MATCH";
-              }else{
-                prefix = "NO_HEAD_MATCH";
-              }
-*/
-              
-              // UMLS semantic type feature conjunctions
-              /*
-              for(Feature feat : features){
-                if(feat.getName().startsWith("ClusterSemType")){
-                  dupFeatures.add(new Feature(feat.getName()+"_"+feature.getName(), feature.getValue()));
-                }
-              }
-              */
-              
-//              if(prefix != null){
-//                dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue()));
-//              }
             }            
           }
           
@@ -350,6 +339,8 @@ public class MentionClusterCoreferenceAn
     }
     
     removeSingletonClusters(jCas);
+    
+    createEventClusters(jCas);
   }
   
  
@@ -423,6 +414,68 @@ public class MentionClusterCoreferenceAn
     ListFactory.append(jCas, cluster.getMembers(), mention);    
   }
 
+  /**
+   * Creates one refsem Element for every CollectionTextRelation (coreference chain) in the CAS.
+   * The Element subtype follows the most frequent UMLS mention class found on the members' head
+   * words; a plain Event is created when no member has such an annotation. The new element has
+   * no span of its own and points to an FSArray holding the chain's Markable members.
+   *
+   * @param jCas JCas object, needed to create UIMA types and to look up annotations
+   * @throws AnalysisEngineProcessException if the most frequent mention class has no matching Element type
+   */
+  private static void createEventClusters(JCas jCas) throws AnalysisEngineProcessException{
+    // For each markable, look up its head node and then the identified annotations covering that node;
+    // among those with the same nominal head, the one with the widest span decides the counted type.
+    Map<ConllDependencyNode, Collection<IdentifiedAnnotation>> dep2event = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+    for(CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)){
+      CounterMap<Class<? extends IdentifiedAnnotation>> headCounts = new CounterMap<>();
+      List<Markable> memberList = new ArrayList<>(JCasUtil.select(cluster.getMembers(), Markable.class));
+      for(Markable member : memberList){
+        ConllDependencyNode head = MapFactory.get(getKey(jCas), member);
+        // Now find all the identified annotations that share this head:
+        IdentifiedAnnotation largest = null;
+        for(IdentifiedAnnotation covering : dep2event.get(head)){
+          if(isUmlsAnnotation(covering) && head == DependencyUtility.getNominalHeadNode(jCas, covering)){
+            if(largest == null || (covering.getEnd()-covering.getBegin() > (largest.getEnd()-largest.getBegin()))){
+              largest = covering;
+            }
+          }
+        }
+        if(largest != null){
+          headCounts.add(largest.getClass());
+        }
+      }
+      FSArray mentions = new FSArray(jCas, memberList.size());
+      IntStream.range(0, memberList.size()).forEach(i -> mentions.set(i, memberList.get(i)));
+
+      final Element element;
+      if(headCounts.isEmpty()){
+        element = new Event(jCas);
+      }else{
+        // headCounts is non-empty in this branch, so a maximum always exists.
+        Class<? extends IdentifiedAnnotation> mostCommon = headCounts.entrySet().stream()
+            .max(Map.Entry.<Class<? extends IdentifiedAnnotation>,Integer>comparingByValue())
+            .get()
+            .getKey();
+        if(DiseaseDisorderMention.class.isAssignableFrom(mostCommon)){
+          element = new DiseaseDisorder(jCas);
+        }else if(ProcedureMention.class.isAssignableFrom(mostCommon)){
+          element = new Procedure(jCas);
+        }else if(SignSymptomMention.class.isAssignableFrom(mostCommon)){
+          element = new SignSymptom(jCas);
+        }else if(MedicationMention.class.isAssignableFrom(mostCommon)){
+          element = new Medication(jCas);
+        }else if(AnatomicalSiteMention.class.isAssignableFrom(mostCommon)){
+          element = new AnatomicalSite(jCas);
+        }else{
+          String message = "This coreference chain has an unknown type: " + mostCommon.getSimpleName();
+          throw new AnalysisEngineProcessException(new IllegalStateException(message));
+        }
+      }
+      element.setMentions(mentions);
+      element.addToIndexes();
+    }
+  }
 
   private static void removeSingletonClusters(JCas jcas){
     List<CollectionTextRelation> toRemove = new ArrayList<>();
@@ -438,6 +491,14 @@ public class MentionClusterCoreferenceAn
     }
   }
   
+  private static boolean isUmlsEvent(IdentifiedAnnotation a){ // true for medication, procedure, sign/symptom or disease/disorder mentions
+    return a instanceof MedicationMention || a instanceof ProcedureMention || a instanceof SignSymptomMention || a instanceof DiseaseDisorderMention;
+  }
+  
+  private static boolean isUmlsAnnotation(IdentifiedAnnotation a){ // anatomical sites, plus everything isUmlsEvent accepts
+    return a instanceof AnatomicalSiteMention || isUmlsEvent(a);
+  }
+  
 //  private static final boolean dominates(Annotation arg1, Annotation arg2) {
 //    return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
 //  }

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java?rev=1772966&r1=1772965&r2=1772966&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java Tue Dec  6 22:02:28 2016
@@ -34,7 +34,7 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator;
 import org.apache.ctakes.coreference.ae.MentionClusterRankingCoreferenceAnnotator;
 import org.apache.ctakes.coreference.ae.PersonChainAnnotator;
-import org.apache.ctakes.coreference.util.CoreferencePipelineFactory;
+import org.apache.ctakes.coreference.factory.CoreferenceAnnotatorFactory;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
 import org.apache.ctakes.temporal.ae.BackwardsTimeAnnotator;
@@ -423,7 +423,7 @@ public class EvaluationOfEventCoreferenc
     }else if(this.evalType == EVAL_SYSTEM.CLUSTER_RANK){
       aggregateBuilder.add(MentionClusterRankingCoreferenceAnnotator.createAnnotatorDescription(directory.getAbsolutePath() + File.separator + "model.jar"));
     }else if(this.evalType == EVAL_SYSTEM.BASELINE){
-      aggregateBuilder.add(CoreferencePipelineFactory.getCoreferencePipeline());
+      aggregateBuilder.add(CoreferenceAnnotatorFactory.getLegacyCoreferencePipeline());
     }else{
       logger.info("Running an evaluation that does not add an annotator: " + this.evalType);
     }

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/factory/CoreferenceAnnotatorFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/factory/CoreferenceAnnotatorFactory.java?rev=1772966&r1=1772965&r2=1772966&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/factory/CoreferenceAnnotatorFactory.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/factory/CoreferenceAnnotatorFactory.java Tue Dec  6 22:02:28 2016
@@ -22,6 +22,10 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator;
 import org.apache.ctakes.coreference.ae.MarkableSalienceAnnotator;
 import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MipacqMarkableCreator;
+import org.apache.ctakes.coreference.ae.MipacqMarkableExpander;
+import org.apache.ctakes.coreference.ae.MipacqMarkablePairGenerator;
+import org.apache.ctakes.coreference.ae.MipacqSvmChainCreator;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.fit.factory.AggregateBuilder;
 import org.apache.uima.fit.factory.AnalysisEngineFactory;
@@ -58,6 +62,17 @@ public class CoreferenceAnnotatorFactory
     return builder.createAggregateDescription();
   }
   
+  /** Aggregates the four Mipacq annotators (markable creator, expander, pair generator, SVM chain creator). */
+  public static AnalysisEngineDescription getLegacyCoreferencePipeline() throws ResourceInitializationException{
+    final AggregateBuilder legacyAggregate = new AggregateBuilder();
+    // Engines are registered in exactly this sequence.
+    legacyAggregate.add(AnalysisEngineFactory.createEngineDescription(MipacqMarkableCreator.class));
+    legacyAggregate.add(AnalysisEngineFactory.createEngineDescription(MipacqMarkableExpander.class));
+    legacyAggregate.add(AnalysisEngineFactory.createEngineDescription(MipacqMarkablePairGenerator.class));
+    legacyAggregate.add(AnalysisEngineFactory.createEngineDescription(MipacqSvmChainCreator.class));
+    return legacyAggregate.createAggregateDescription();
+  }
+  
   // This method will point at the method we think is most likely to be useful for callers of mixed understanding
   // who may not grok the method names for the systems named for their implementation.
   public static AnalysisEngineDescription getDefaultCoreferencePipeline() throws ResourceInitializationException {

Modified: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java?rev=1772966&r1=1772965&r2=1772966&view=diff
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java (original)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java Tue Dec  6 22:02:28 2016
@@ -18,7 +18,11 @@
  */
 package org.apache.ctakes.utils.struct;
 
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
 
 // This class is a simplifying class which makes it easy to build hashes to keep track of counts
 // and write less boilerplate code.  If you just call it with an object, it will increment the
@@ -43,4 +47,12 @@ public class CounterMap<K> extends HashM
 		}
 		super.put(key, super.get(key)+i);
 	}
+	
+	/** Returns all keys of this map, ordered from the highest stored value to the lowest. */
+	public List<K> getKeysSortedByValue(){
+		return this.entrySet().stream()
+			.sorted(Map.Entry.comparingByValue(Collections.reverseOrder()))
+			.map(Map.Entry::getKey)
+			.collect(Collectors.toList());
+	}
 }