You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/04/08 20:59:49 UTC

svn commit: r1585816 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/I2B2TemporalXMLReader.java eval/EvaluationOfTimeSpans.java eval/Evaluation_ImplBase.java eval/I2B2Data.java

Author: tmill
Date: Tue Apr  8 18:59:49 2014
New Revision: 1585816

URL: http://svn.apache.org/r1585816
Log:
CTAKES-82: Code to read i2b2 xml and do evals on at least time spans.

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java
Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java?rev=1585816&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java Tue Apr  8 18:59:49 2014
@@ -0,0 +1,156 @@
+package org.apache.ctakes.temporal.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.Event;
+import org.apache.ctakes.typesystem.type.refsem.EventProperties;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewURIUtil;
+import org.jdom2.Element;
+import org.jdom2.JDOMException;
+import org.jdom2.input.SAXBuilder;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.factory.AnalysisEngineFactory;
+
+/**
+ * Reads the i2b2 temporal XML annotations (TIMEX3, EVENT and TLINK tags) that belong to the
+ * document in a CAS view and adds the corresponding cTAKES annotations to that view.
+ */
+public class I2B2TemporalXMLReader extends JCasAnnotator_ImplBase {
+  public static final String PARAM_INPUT_DIR = "PARAM_INPUT_DIR";
+  @ConfigurationParameter(
+      name=PARAM_INPUT_DIR,
+      mandatory=true, 
+      description="Directory containing i2b2 files to read")
+  protected File inputDir;
+  
+  public static final String PARAM_MAP_THYME = "PARAM_MAP_THYME";
+  @ConfigurationParameter(
+      name=PARAM_MAP_THYME,
+      mandatory=false,
+      description="Whether to map i2b2 relations/properties/types to THYME types")
+  protected boolean mapThyme=false;
+  
+  /** Suffix that is removed from the text file path to obtain the XML file path. */
+  private static final String TEXT_SUFFIX = ".txt";
+  
+  /**
+   * Parses the XML file paired with the view's text file and indexes a TimeMention per TIMEX3,
+   * an Event/EventMention pair per EVENT and a TemporalTextRelation per TLINK.
+   *
+   * @param jcas view whose URI points at the text file; receives the new annotations
+   * @throws AnalysisEngineProcessException if the XML file cannot be read or parsed
+   * @throws UnsupportedOperationException if THYME mapping is enabled and a TLINK is present
+   */
+  @Override
+  public void process(JCas jcas) throws AnalysisEngineProcessException {
+    // the XML path is the text path minus its trailing suffix
+    String textPath = new File(ViewURIUtil.getURI(jcas)).getAbsolutePath();
+    File xmlFile = new File(textPath.substring(0, textPath.length()-TEXT_SUFFIX.length()));
+    Map<String,Annotation> id2entity = new HashMap<>();
+    
+    // load the XML
+    Element tagsElem;
+    try {
+      tagsElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement().getChild("TAGS");
+    } catch (JDOMException | IOException e) {
+      throw new AnalysisEngineProcessException(e);
+    }
+
+    for(Element timexEl : tagsElem.getChildren("TIMEX3")){
+      // XML offsets are shifted down by one (presumably 1-based in the i2b2 files -- confirm)
+      int begin = Integer.parseInt(timexEl.getAttributeValue("start"))-1;
+      int end = Integer.parseInt(timexEl.getAttributeValue("end"))-1;
+      TimeMention timex = new TimeMention(jcas, begin, end);
+      id2entity.put(timexEl.getAttributeValue("id"), timex);
+      timex.setTimeClass(timexEl.getAttributeValue("type"));
+      timex.addToIndexes();
+    }
+    
+    for(Element eventEl : tagsElem.getChildren("EVENT")){
+      int begin = Integer.parseInt(eventEl.getAttributeValue("start"))-1;
+      int end = Integer.parseInt(eventEl.getAttributeValue("end"))-1;
+      Event e = new Event(jcas);
+      EventProperties props = new EventProperties(jcas);
+      EventMention event = new EventMention(jcas, begin, end);
+      id2entity.put(eventEl.getAttributeValue("id"), event);
+      // constant-first comparisons so that a missing attribute does not cause an NPE
+      String polarity = eventEl.getAttributeValue("polarity");
+      if("POS".equals(polarity)) event.setPolarity(CONST.NE_POLARITY_NEGATION_ABSENT);
+      else if("NEG".equals(polarity)) event.setPolarity(CONST.NE_POLARITY_NEGATION_PRESENT);
+      String modality = eventEl.getAttributeValue("modality");
+      if(mapThyme){
+        if("FACTUAL".equals(modality)){
+          props.setContextualModality("ACTUAL");
+        }else if("POSSIBLE".equals(modality)){
+          props.setContextualModality("HEDGED");
+        }else if("HYPOTHETICAL".equals(modality) || "CONDITIONAL".equals(modality)){
+          props.setContextualModality("HYPOTHETICAL");
+        }else if("PROPOSED".equals(modality)){
+          props.setContextualModality("GENERIC");
+        }
+      }else{
+        props.setContextualModality(modality);
+      }
+      e.setProperties(props);
+      FSArray mentions = new FSArray(jcas,1);
+      mentions.set(0, event);
+      e.setMentions(mentions);
+      event.setEvent(e);
+      e.addToIndexes();
+      event.addToIndexes();
+      props.addToIndexes();
+    }
+    
+    for(Element linkEl : tagsElem.getChildren("TLINK")){
+      if(mapThyme){
+        throw new UnsupportedOperationException("Mapping to THYME relations is not implemented yet!");
+      }
+      Annotation fromEnt = id2entity.get(linkEl.getAttributeValue("fromID"));
+      Annotation toEnt = id2entity.get(linkEl.getAttributeValue("toID"));
+      if(fromEnt == null || toEnt == null){
+        // an endpoint id that matches no TIMEX3/EVENT read above cannot be linked
+        continue;
+      }
+      TemporalTextRelation link = new TemporalTextRelation(jcas);
+      RelationArgument arg1 = new RelationArgument(jcas);
+      arg1.setArgument(fromEnt);
+      link.setArg1(arg1);
+      RelationArgument arg2 = new RelationArgument(jcas);
+      arg2.setArgument(toEnt);
+      link.setArg2(arg2);
+      link.setCategory(linkEl.getAttributeValue("type"));
+      // index the relation and its arguments; otherwise they cannot be retrieved from the CAS
+      arg1.addToIndexes();
+      arg2.addToIndexes();
+      link.addToIndexes();
+    }
+  }
+
+  /**
+   * Creates a description of this annotator with {@link #PARAM_INPUT_DIR} set.
+   *
+   * @param xmlDirectory value for the input directory parameter
+   */
+  public static AnalysisEngineDescription getDescription(File xmlDirectory) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        I2B2TemporalXMLReader.class,
+        I2B2TemporalXMLReader.PARAM_INPUT_DIR,
+        xmlDirectory);
+  }
+}

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java?rev=1585816&r1=1585815&r2=1585816&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java Tue Apr  8 18:59:49 2014
@@ -73,10 +73,20 @@ public class EvaluationOfTimeSpans exten
 
 	public static void main(String[] args) throws Exception {
 		Options options = CliFactory.parseArguments(Options.class, args);
-		List<Integer> patientSets = options.getPatients().getList();
-		List<Integer> trainItems = THYMEData.getTrainPatientSets(patientSets);
-		List<Integer> devItems = THYMEData.getDevPatientSets(patientSets);
-		List<Integer> testItems = THYMEData.getTestPatientSets(patientSets);
+		List<Integer> trainItems = null;
+    List<Integer> devItems = null;
+    List<Integer> testItems = null;
+		
+    List<Integer> patientSets = options.getPatients().getList();
+    if(options.getXMLFormat() == XMLFormat.I2B2){
+      trainItems = I2B2Data.getTrainPatientSets(options.getXMLDirectory());
+      devItems = I2B2Data.getDevPatientSets(options.getXMLDirectory());
+      testItems = I2B2Data.getTestPatientSets(options.getXMLDirectory());
+    }else{
+      trainItems = THYMEData.getTrainPatientSets(patientSets);
+      devItems = THYMEData.getDevPatientSets(patientSets);
+      testItems = THYMEData.getTestPatientSets(patientSets);
+    }
 		
 		List<Integer> allTrain = new ArrayList<Integer>(trainItems);
 		List<Integer> allTest = null;
@@ -94,13 +104,13 @@ public class EvaluationOfTimeSpans exten
 		annotatorClasses.add(TimeAnnotator.class);
 		annotatorClasses.add(ConstituencyBasedTimeAnnotator.class);
 		annotatorClasses.add(CRFTimeAnnotator.class);
-		annotatorClasses.add(MetaTimeAnnotator.class);
+//		annotatorClasses.add(MetaTimeAnnotator.class);
 		Map<Class<? extends JCasAnnotator_ImplBase>, String[]> annotatorTrainingArguments = Maps.newHashMap();
 		annotatorTrainingArguments.put(BackwardsTimeAnnotator.class, new String[]{"-c", "0.3"});
 		annotatorTrainingArguments.put(TimeAnnotator.class, new String[]{"-c", "0.1"});
 		annotatorTrainingArguments.put(ConstituencyBasedTimeAnnotator.class, new String[]{"-c", "0.3"});
 		annotatorTrainingArguments.put(CRFTimeAnnotator.class, new String[]{"-p", "c2=0.03"});
-		annotatorTrainingArguments.put(MetaTimeAnnotator.class, new String[]{"-p", "c2=0.3"});
+//		annotatorTrainingArguments.put(MetaTimeAnnotator.class, new String[]{"-p", "c2=0.3"});
 
 		// run one evaluation per annotator class
 		final Map<Class<?>, AnnotationStatistics<?>> annotatorStats = Maps.newHashMap();

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1585816&r1=1585815&r2=1585816&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Tue Apr  8 18:59:49 2014
@@ -37,6 +37,7 @@ import org.apache.ctakes.constituency.pa
 import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
 import org.apache.ctakes.core.ae.OverlapAnnotator;
 import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
 import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
 import org.apache.ctakes.core.resource.FileLocator;
 import org.apache.ctakes.core.resource.FileResourceImpl;
@@ -47,7 +48,9 @@ import org.apache.ctakes.dependency.pars
 import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
 import org.apache.ctakes.lvg.ae.LvgAnnotator;
 import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
+import org.apache.ctakes.parser.berkeley.BerkeleyParserWrapper;
 import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.temporal.ae.I2B2TemporalXMLReader;
 import org.apache.ctakes.temporal.ae.THYMEAnaforaXMLReader;
 import org.apache.ctakes.temporal.ae.THYMEKnowtatorXMLReader;
 import org.apache.ctakes.temporal.ae.THYMETreebankReader;
@@ -106,7 +109,7 @@ public abstract class Evaluation_ImplBas
 
   public static final String GOLD_VIEW_NAME = "GoldView";
   
-  enum XMLFormat { Knowtator, Anafora }
+  enum XMLFormat { Knowtator, Anafora, I2B2 }
 
   static interface Options {
 
@@ -219,7 +222,7 @@ public abstract class Evaluation_ImplBas
           }
 				}
 			}
-		} else {
+		} else if(this.xmlFormat == XMLFormat.Knowtator) {
 	  for (Integer set : patientSets) {
 		  final int setNum = set;
 		  for (File file : rawTextDirectory.listFiles(new FilenameFilter(){
@@ -242,7 +245,33 @@ public abstract class Evaluation_ImplBas
 			    }
 			  } 
 		  }
-	  }
+	  } 
+	  } else if(this.xmlFormat == XMLFormat.I2B2) {
+	    File trainDir = new File(this.xmlDirectory, "training");
+	    File testDir = new File(this.xmlDirectory, "test");
+      for (Integer pt : patientSets){
+        File xmlTrain = new File(trainDir, pt+".xml");
+        File train = new File(trainDir, pt+".xml.txt");
+        if(train.exists()){
+          if(xmlTrain.exists()){
+            files.add(train);
+          }else{
+            System.err.println("Text file in training has no corresponding xml -- skipping: " + train);
+          }
+        }
+        File xmlText = new File(testDir, pt+".xml");
+        File test = new File(testDir, pt+".xml.txt");
+        if(test.exists()){
+          if(xmlText.exists()){
+            files.add(test);
+          }else{
+            System.err.println("Text file in test has no corresponding xml -- skipping: " + test);
+          }
+        }
+        assert !(train.exists() && test.exists());
+      }
+    } else{
+      throw new UnsupportedOperationException("Did not supply a valid xml format type.");
     }
     return files;
   }
@@ -297,10 +326,20 @@ public abstract class Evaluation_ImplBas
           CAS.NAME_DEFAULT_SOFA,
           GOLD_VIEW_NAME);
       break;
+    case I2B2:
+      aggregateBuilder.add(
+          I2B2TemporalXMLReader.getDescription(this.xmlDirectory),
+          CAS.NAME_DEFAULT_SOFA,
+          GOLD_VIEW_NAME);
+      break;
     }
 
     // identify segments
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SegmentsFromBracketedSectionTagsAnnotator.class));
+    if(this.xmlFormat == XMLFormat.I2B2){
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
+    }else{
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SegmentsFromBracketedSectionTagsAnnotator.class));
+    }
     // identify sentences
     aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
         SentenceDetector.class,
@@ -471,7 +510,14 @@ public abstract class Evaluation_ImplBas
       aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class));
     }else{
       // add ctakes constituency parses to system view
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class,
+          ConstituencyParser.PARAM_MODEL_FILENAME,
+          "org/apache/ctakes/constituency/parser/models/thyme.bin"));
+//      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(BerkeleyParserWrapper.class,
+//          BerkeleyParserWrapper.PARAM_MODEL_FILENAME,
+//          
+//        "org/apache/ctakes/constituency/parser/models/thyme.gcg.4sm.bin"));
+//          "org/apache/ctakes/constituency/parser/models/thyme.4sm.bin"));
     }
     // write out the CAS after all the above annotations
     aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java?rev=1585816&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java Tue Apr  8 18:59:49 2014
@@ -0,0 +1,86 @@
+package org.apache.ctakes.temporal.eval;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Derives train/dev/test document ids from the numerically named XML files in the "training"
+ * and "test" subdirectories of an i2b2 corpus directory.
+ */
+public class I2B2Data {
+
+  // Nothing special here -- no dev set is specified by the i2b2 challenge, so the training
+  // directory is split at this id so there are about 150 train and 40 dev documents.
+  private static final int trainDevCutoff = 600;
+  
+  // matches "<digits>.xml"; the dot is escaped so it only matches a literal '.'
+  static Pattern filePatt = Pattern.compile("(\\d+)\\.xml");
+  
+  /**
+   * Returns the ids in the "training" subdirectory that are below the train/dev cutoff.
+   *
+   * @param xmlDirectory corpus directory containing the "training" subdirectory
+   */
+  public static List<Integer> getTrainPatientSets(File xmlDirectory) {
+    List<Integer> trains = new ArrayList<>();
+    for(int ptNum : getPatientNumbers(xmlDirectory, "training")){
+      if(ptNum < trainDevCutoff){
+        trains.add(ptNum);
+      }
+    }
+    return trains;
+  }
+
+  /**
+   * Returns the ids in the "training" subdirectory that are at or above the train/dev cutoff.
+   *
+   * @param xmlDirectory corpus directory containing the "training" subdirectory
+   */
+  public static List<Integer> getDevPatientSets(File xmlDirectory) {
+    List<Integer> devs = new ArrayList<>();
+    for(int ptNum : getPatientNumbers(xmlDirectory, "training")){
+      if(ptNum >= trainDevCutoff){
+        devs.add(ptNum);
+      }
+    }
+    return devs;
+  }
+
+  /**
+   * Returns all ids in the "test" subdirectory.
+   *
+   * @param xmlDirectory corpus directory containing the "test" subdirectory
+   */
+  public static List<Integer> getTestPatientSets(File xmlDirectory) {
+    return getPatientNumbers(xmlDirectory, "test");
+  }
+  
+  /** Parses the numeric id out of every {@code <digits>.xml} file name in the subdirectory. */
+  private static List<Integer> getPatientNumbers(File xmlDirectory, String sub){
+    List<Integer> ptNums = new ArrayList<>();
+    for(File file : getAllFiles(xmlDirectory, sub)){
+      Matcher m = filePatt.matcher(file.getName());
+      if(m.matches()){
+        ptNums.add(Integer.parseInt(m.group(1)));
+      }
+    }
+    return ptNums;
+  }
+  
+  /** Lists the *.xml files in the given subdirectory; empty if it cannot be listed. */
+  private static File[] getAllFiles(File xmlDirectory, String sub){
+    File[] files = new File(xmlDirectory, sub).listFiles(new FilenameFilter() {      
+      @Override
+      public boolean accept(File dir, String name) {
+        return name.endsWith(".xml");
+      }
+    });
+    // listFiles returns null when the path is not a directory or an I/O error occurs
+    return files == null ? new File[0] : files;
+  }
+
+}