You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/05/10 15:40:01 UTC

svn commit: r1481009 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: data/analysis/TimexTreeAlignmentStatistics.java eval/RemoveTreeAlignedMentions.java

Author: tmill
Date: Fri May 10 13:40:01 2013
New Revision: 1481009

URL: http://svn.apache.org/r1481009
Log:
Two new classes for tree alignment.

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java?rev=1481009&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java Fri May 10 13:40:01 2013
@@ -0,0 +1,155 @@
+package org.apache.ctakes.temporal.data.analysis;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.constituency.parser.util.TreeUtils;
+import org.apache.ctakes.temporal.eval.CommandLine;
+import org.apache.ctakes.temporal.eval.Evaluation_ImplBase.XMIReader;
+import org.apache.ctakes.temporal.eval.THYMEData;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.utils.tree.SimpleTree;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.util.JCasUtil;
+
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+
+/**
+ * Command-line tool that counts how many gold {@link TimeMention} spans in the
+ * training patient sets coincide exactly with a {@link TreebankNode} span, and
+ * prints the smallest covering treebank node for every mention that does not.
+ */
+public class TimexTreeAlignmentStatistics {
+  /** Command-line options, parsed by {@link CliFactory}. */
+  static interface Options{
+    @Option(longName = "xmi")
+    public File getXMIDirectory();
+
+    @Option(longName = "patients")
+    public CommandLine.IntegerRanges getPatients();
+
+    @Option(longName = "text")
+    public File getRawTextDirectory();
+  }
+  
+  /**
+   * Runs the raw-text files of the training patient sets through an XMI-loading
+   * pipeline and prints alignment statistics to standard output.
+   *
+   * @param args command-line arguments; see {@link Options}
+   * @throws IOException part of the declared signature; NOTE(review): confirm which call can raise it
+   * @throws UIMAException if building or running the pipeline, or looking up the gold view, fails
+   */
+  public static void main(String[] args) throws UIMAException, IOException {
+    Options options = CliFactory.parseArguments(Options.class, args);
+    List<Integer> patientSets = options.getPatients().getList();
+    // Only the training split is analysed.
+    List<Integer> trainItems = THYMEData.getTrainPatientSets(patientSets);
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(getFilesFor(trainItems, options.getRawTextDirectory()));
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        XMIReader.class,
+        XMIReader.PARAM_XMI_DIRECTORY,
+        options.getXMIDirectory()));
+    AnalysisEngine ae = aggregateBuilder.createAggregate();
+    int numMentions=0;
+    int numMatches=0;
+    
+    for(JCas jCas : new JCasIterable(reader, ae)){
+      for(Segment segment : JCasUtil.select(jCas, Segment.class)){
+        // Mentions come from the gold view; treebank nodes are looked up in the view the pipeline hands us.
+        Collection<TimeMention> mentions = JCasUtil.selectCovered(jCas.getView("GoldView"), TimeMention.class, segment);
+        for(TimeMention mention : mentions){
+          numMentions++;
+          if(hasExactSpanMatch(jCas, mention)){
+            numMatches++;
+            continue;
+          }
+          TreebankNode smallestCoveringNode = findSmallestCoveringNode(jCas, mention);
+          System.out.println("No alignment for: " + mention.getCoveredText());
+          System.out.println("Smallest covering treebank node is: " + (smallestCoveringNode == null ? "null" : smallestCoveringNode.getCoveredText()));
+          System.out.println(smallestCoveringNode == null ? "no tree" : TreeUtils.tree2str(smallestCoveringNode));
+        }
+      }
+    }
+    System.out.printf("Found %d mentions, %d match with node spans\n", numMentions, numMatches);
+  }
+
+  /** Returns true if some treebank node covered by {@code mention} has exactly the mention's begin and end offsets. */
+  private static boolean hasExactSpanMatch(JCas jCas, TimeMention mention) {
+    for(TreebankNode node : JCasUtil.selectCovered(jCas, TreebankNode.class, mention)){
+      if(node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd()){
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /** Returns the shortest treebank node covering {@code mention} (the first one wins ties), or null if none covers it. */
+  private static TreebankNode findSmallestCoveringNode(JCas jCas, TimeMention mention) {
+    TreebankNode smallest = null;
+    int smallestLen = Integer.MAX_VALUE;
+    for(TreebankNode node : JCasUtil.selectCovering(jCas, TreebankNode.class, mention.getBegin(), mention.getEnd())){
+      int len = node.getEnd() - node.getBegin();
+      if(len <  smallestLen){
+        smallestLen = len;
+        smallest = node;
+      }
+    }
+    return smallest;
+  }
+
+  /**
+   * Collects the non-hidden entries of {@code rawTextDirectory} whose names contain
+   * {@code ID} followed by the zero-padded, three-digit number of one of the patient sets.
+   *
+   * @param patientSets patient set numbers to look for
+   * @param rawTextDirectory directory holding the raw text files
+   * @return the matching files, grouped in the order of {@code patientSets}; empty when
+   *         the path does not exist, is not a directory, or cannot be listed
+   */
+  private static List<File> getFilesFor(List<Integer> patientSets, File rawTextDirectory) {
+    // isDirectory() is false for a missing path as well, so one check covers both cases.
+    if ( !rawTextDirectory.isDirectory() ) {
+      return Collections.emptyList();
+    }
+    List<File> files = new ArrayList<File>();
+    for (Integer set : patientSets) {
+      final int setNum = set;
+      File[] candidates = rawTextDirectory.listFiles(new FilenameFilter(){
+        @Override
+        public boolean accept(File dir, String name) {
+          return name.contains(String.format("ID%03d", setNum));
+        }});
+      // listFiles() returns null on an I/O error; skip the set rather than fail with a NullPointerException.
+      if (candidates == null) {
+        continue;
+      }
+      for (File file : candidates) {
+        // skip hidden files like .svn
+        if (!file.isHidden()) {
+          files.add(file);
+        }
+      }
+    }
+    return files;
+  }
+}

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java?rev=1481009&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java Fri May 10 13:40:01 2013
@@ -0,0 +1,80 @@
+package org.apache.ctakes.temporal.eval;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Annotator that removes from the processed view every {@link TimeMention} whose span
+ * exactly matches the span of a {@link TreebankNode}, leaving only the mentions that do
+ * not line up with a parse-tree constituent.
+ */
+public class RemoveTreeAlignedMentions extends JCasAnnotator_ImplBase {
+
+  public static final String PARAM_GOLDVIEW_NAME = "GOLD_VIEW_NAME";
+  public static Logger logger = Logger.getLogger(RemoveTreeAlignedMentions.class);
+  
+  @ConfigurationParameter(
+      name = PARAM_GOLDVIEW_NAME,
+      mandatory = true,
+      description = "Name of the cas view of gold standard data")
+  private String goldViewName;
+  
+  /**
+   * Drops tree-aligned time mentions from {@code jCas} and logs the ones that remain.
+   *
+   * @param jCas the CAS whose time mentions are filtered in place
+   * @throws AnalysisEngineProcessException if the configured gold view cannot be obtained
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    try {
+      // The gold view is not read below; the lookup is kept so a missing view still fails here.
+      // NOTE(review): confirm whether the gold view was meant to be used by this annotator.
+      jCas.getView(goldViewName);
+    } catch (CASException e) {
+      // Wrap the exception as the cause: the (String, Object[]) constructor treats its first
+      // argument as a resource-bundle message key and does not record a cause.
+      logger.error("Could not extract gold view from jcas!", e);
+      throw new AnalysisEngineProcessException(e);
+    }
+    
+    Collection<TimeMention> times = JCasUtil.select(jCas, TimeMention.class);
+    logger.info("File contains: " + times.size() + " timex mentions from first pass.");
+    
+    // Removal is deferred until after the loop so the annotation index is not modified while iterating over it.
+    List<Annotation> removeList = new ArrayList<Annotation>();
+    for(TimeMention time : times){
+      if(alignsWithTreeNode(jCas, time)){
+        removeList.add(time);
+      }
+    }
+    for(Annotation mention : removeList){
+      mention.removeFromIndexes();
+    }
+    for(TimeMention time : JCasUtil.select(jCas, TimeMention.class)){
+      logger.info("Preserved time mention: " + time.getCoveredText());
+    }
+  }
+
+  /** Returns true if a treebank node covered by {@code time} has exactly the same begin and end offsets. */
+  private static boolean alignsWithTreeNode(JCas jCas, TimeMention time) {
+    for(TreebankNode node : JCasUtil.selectCovered(jCas, TreebankNode.class, time)){
+      if(node.getBegin() == time.getBegin() && node.getEnd() == time.getEnd()){
+        return true;
+      }
+    }
+    return false;
+  }
+}