You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/04/08 20:59:49 UTC
svn commit: r1585816 - in
/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal:
ae/I2B2TemporalXMLReader.java eval/EvaluationOfTimeSpans.java
eval/Evaluation_ImplBase.java eval/I2B2Data.java
Author: tmill
Date: Tue Apr 8 18:59:49 2014
New Revision: 1585816
URL: http://svn.apache.org/r1585816
Log:
CTAKES-82: Code to read i2b2 xml and do evals on at least time spans.
Added:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java
Modified:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java?rev=1585816&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/I2B2TemporalXMLReader.java Tue Apr 8 18:59:49 2014
@@ -0,0 +1,132 @@
+package org.apache.ctakes.temporal.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.Event;
+import org.apache.ctakes.typesystem.type.refsem.EventProperties;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewURIUtil;
+import org.jdom2.Element;
+import org.jdom2.JDOMException;
+import org.jdom2.input.SAXBuilder;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.factory.AnalysisEngineFactory;
+
+/**
+ * Reads the i2b2 XML annotations that accompany a document and adds the TIMEX3, EVENT and TLINK
+ * tags to the CAS as TimeMention, EventMention/Event and TemporalTextRelation annotations.
+ */
+public class I2B2TemporalXMLReader extends JCasAnnotator_ImplBase {
+  public static final String PARAM_INPUT_DIR = "PARAM_INPUT_DIR";
+  @ConfigurationParameter(
+      name=PARAM_INPUT_DIR,
+      mandatory=true,
+      description="Directory containing i2b2 files to read")
+  protected File inputDir;
+
+  public static final String PARAM_MAP_THYME = "PARAM_MAP_THYME";
+  @ConfigurationParameter(
+      name=PARAM_MAP_THYME,
+      mandatory=false,
+      description="Whether to map i2b2 relations/properties/types to THYME types")
+  protected boolean mapThyme=false;
+
+  /**
+   * Loads the XML file that belongs to the document in the given CAS and adds its annotations.
+   *
+   * @throws AnalysisEngineProcessException if the XML cannot be read or parsed, or if it has no
+   *         TAGS element
+   * @throws UnsupportedOperationException if THYME mapping is enabled and a TLINK is present
+   */
+  @Override
+  public void process(JCas jcas) throws AnalysisEngineProcessException {
+    File textFile = new File(ViewURIUtil.getURI(jcas));
+    // The XML path is the text path minus its last four characters (e.g. "1.xml.txt" -> "1.xml").
+    String textPath = textFile.getAbsolutePath();
+    File xmlFile = new File(textPath.substring(0, textPath.length()-4));
+
+    // load the XML
+    Element tags;
+    try {
+      tags = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement().getChild("TAGS");
+    } catch (JDOMException | IOException e) {
+      throw new AnalysisEngineProcessException(e);
+    }
+    if(tags == null){
+      // fail with a message that names the file instead of a bare NullPointerException
+      throw new AnalysisEngineProcessException(
+          new IllegalStateException("No TAGS element found in " + xmlFile));
+    }
+
+    Map<String,Annotation> id2entity = new HashMap<>();
+    addTimexes(jcas, tags, id2entity);
+    addEvents(jcas, tags, id2entity);
+    addLinks(jcas, tags, id2entity);
+  }
+
+  /** Adds a TimeMention for each TIMEX3 tag and records it under the tag's id. */
+  private static void addTimexes(JCas jcas, Element tags, Map<String,Annotation> id2entity){
+    for(Element timexEl : tags.getChildren("TIMEX3")){
+      // the start/end attributes are shifted down by one to get the CAS offsets
+      int begin = Integer.parseInt(timexEl.getAttributeValue("start"))-1;
+      int end = Integer.parseInt(timexEl.getAttributeValue("end"))-1;
+      TimeMention timex = new TimeMention(jcas, begin, end);
+      id2entity.put(timexEl.getAttributeValue("id"), timex);
+      timex.setTimeClass(timexEl.getAttributeValue("type"));
+      timex.addToIndexes();
+    }
+  }
+
+  /** Adds an EventMention, with its Event and EventProperties, for each EVENT tag. */
+  private void addEvents(JCas jcas, Element tags, Map<String,Annotation> id2entity){
+    for(Element eventEl : tags.getChildren("EVENT")){
+      int begin = Integer.parseInt(eventEl.getAttributeValue("start"))-1;
+      int end = Integer.parseInt(eventEl.getAttributeValue("end"))-1;
+      Event e = new Event(jcas);
+      EventProperties props = new EventProperties(jcas);
+      EventMention event = new EventMention(jcas, begin, end);
+      id2entity.put(eventEl.getAttributeValue("id"), event);
+
+      // constant-first equals: a tag without a polarity attribute leaves the polarity unset
+      String polarity = eventEl.getAttributeValue("polarity");
+      if("POS".equals(polarity)) event.setPolarity(CONST.NE_POLARITY_NEGATION_ABSENT);
+      else if("NEG".equals(polarity)) event.setPolarity(CONST.NE_POLARITY_NEGATION_PRESENT);
+
+      String modality = eventEl.getAttributeValue("modality");
+      props.setContextualModality(mapThyme ? toThymeModality(modality) : modality);
+
+      e.setProperties(props);
+      FSArray mentions = new FSArray(jcas,1);
+      mentions.set(0, event);
+      e.setMentions(mentions);
+      event.setEvent(e);
+      e.addToIndexes();
+      event.addToIndexes();
+      props.addToIndexes();
+    }
+  }
+
+  /** Maps an i2b2 modality to a THYME contextual modality; returns null if there is no mapping. */
+  private static String toThymeModality(String modality){
+    if("FACTUAL".equals(modality)) return "ACTUAL";
+    if("POSSIBLE".equals(modality)) return "HEDGED";
+    if("HYPOTHETICAL".equals(modality) || "CONDITIONAL".equals(modality)) return "HYPOTHETICAL";
+    if("PROPOSED".equals(modality)) return "GENERIC";
+    return null;
+  }
+
+  /** Adds a TemporalTextRelation for each TLINK tag whose two endpoints were both read. */
+  private void addLinks(JCas jcas, Element tags, Map<String,Annotation> id2entity){
+    for(Element linkEl : tags.getChildren("TLINK")){
+      if(mapThyme){
+        throw new UnsupportedOperationException("Mapping to THYME relations is not implemented yet!");
+      }
+      Annotation fromEnt = id2entity.get(linkEl.getAttributeValue("fromID"));
+      Annotation toEnt = id2entity.get(linkEl.getAttributeValue("toID"));
+      if(fromEnt == null || toEnt == null){
+        // the id does not belong to a TIMEX3 or EVENT read above; a relation with a
+        // missing argument would be unusable, so the link is skipped
+        continue;
+      }
+      TemporalTextRelation link = new TemporalTextRelation(jcas);
+      RelationArgument arg1 = new RelationArgument(jcas);
+      arg1.setArgument(fromEnt);
+      link.setArg1(arg1);
+      RelationArgument arg2 = new RelationArgument(jcas);
+      arg2.setArgument(toEnt);
+      link.setArg2(arg2);
+      link.setCategory(linkEl.getAttributeValue("type"));
+      // index the relation and its arguments so that they can be found through the CAS indexes
+      arg1.addToIndexes();
+      arg2.addToIndexes();
+      link.addToIndexes();
+    }
+  }
+
+  /** Creates a description of this reader with the given directory as PARAM_INPUT_DIR. */
+  public static AnalysisEngineDescription getDescription(File xmlDirectory) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        I2B2TemporalXMLReader.class,
+        I2B2TemporalXMLReader.PARAM_INPUT_DIR,
+        xmlDirectory);
+  }
+}
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java?rev=1585816&r1=1585815&r2=1585816&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java Tue Apr 8 18:59:49 2014
@@ -73,10 +73,20 @@ public class EvaluationOfTimeSpans exten
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
- List<Integer> patientSets = options.getPatients().getList();
- List<Integer> trainItems = THYMEData.getTrainPatientSets(patientSets);
- List<Integer> devItems = THYMEData.getDevPatientSets(patientSets);
- List<Integer> testItems = THYMEData.getTestPatientSets(patientSets);
+ List<Integer> trainItems = null;
+ List<Integer> devItems = null;
+ List<Integer> testItems = null;
+
+ List<Integer> patientSets = options.getPatients().getList();
+ if(options.getXMLFormat() == XMLFormat.I2B2){
+ trainItems = I2B2Data.getTrainPatientSets(options.getXMLDirectory());
+ devItems = I2B2Data.getDevPatientSets(options.getXMLDirectory());
+ testItems = I2B2Data.getTestPatientSets(options.getXMLDirectory());
+ }else{
+ trainItems = THYMEData.getTrainPatientSets(patientSets);
+ devItems = THYMEData.getDevPatientSets(patientSets);
+ testItems = THYMEData.getTestPatientSets(patientSets);
+ }
List<Integer> allTrain = new ArrayList<Integer>(trainItems);
List<Integer> allTest = null;
@@ -94,13 +104,13 @@ public class EvaluationOfTimeSpans exten
annotatorClasses.add(TimeAnnotator.class);
annotatorClasses.add(ConstituencyBasedTimeAnnotator.class);
annotatorClasses.add(CRFTimeAnnotator.class);
- annotatorClasses.add(MetaTimeAnnotator.class);
+// annotatorClasses.add(MetaTimeAnnotator.class);
Map<Class<? extends JCasAnnotator_ImplBase>, String[]> annotatorTrainingArguments = Maps.newHashMap();
annotatorTrainingArguments.put(BackwardsTimeAnnotator.class, new String[]{"-c", "0.3"});
annotatorTrainingArguments.put(TimeAnnotator.class, new String[]{"-c", "0.1"});
annotatorTrainingArguments.put(ConstituencyBasedTimeAnnotator.class, new String[]{"-c", "0.3"});
annotatorTrainingArguments.put(CRFTimeAnnotator.class, new String[]{"-p", "c2=0.03"});
- annotatorTrainingArguments.put(MetaTimeAnnotator.class, new String[]{"-p", "c2=0.3"});
+// annotatorTrainingArguments.put(MetaTimeAnnotator.class, new String[]{"-p", "c2=0.3"});
// run one evaluation per annotator class
final Map<Class<?>, AnnotationStatistics<?>> annotatorStats = Maps.newHashMap();
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1585816&r1=1585815&r2=1585816&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Tue Apr 8 18:59:49 2014
@@ -37,6 +37,7 @@ import org.apache.ctakes.constituency.pa
import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
import org.apache.ctakes.core.ae.OverlapAnnotator;
import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.core.resource.FileResourceImpl;
@@ -47,7 +48,9 @@ import org.apache.ctakes.dependency.pars
import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
import org.apache.ctakes.lvg.ae.LvgAnnotator;
import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
+import org.apache.ctakes.parser.berkeley.BerkeleyParserWrapper;
import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.temporal.ae.I2B2TemporalXMLReader;
import org.apache.ctakes.temporal.ae.THYMEAnaforaXMLReader;
import org.apache.ctakes.temporal.ae.THYMEKnowtatorXMLReader;
import org.apache.ctakes.temporal.ae.THYMETreebankReader;
@@ -106,7 +109,7 @@ public abstract class Evaluation_ImplBas
public static final String GOLD_VIEW_NAME = "GoldView";
- enum XMLFormat { Knowtator, Anafora }
+ enum XMLFormat { Knowtator, Anafora, I2B2 }
static interface Options {
@@ -219,7 +222,7 @@ public abstract class Evaluation_ImplBas
}
}
}
- } else {
+ } else if(this.xmlFormat == XMLFormat.Knowtator) {
for (Integer set : patientSets) {
final int setNum = set;
for (File file : rawTextDirectory.listFiles(new FilenameFilter(){
@@ -242,7 +245,33 @@ public abstract class Evaluation_ImplBas
}
}
}
- }
+ }
+ } else if(this.xmlFormat == XMLFormat.I2B2) {
+ File trainDir = new File(this.xmlDirectory, "training");
+ File testDir = new File(this.xmlDirectory, "test");
+ for (Integer pt : patientSets){
+ File xmlTrain = new File(trainDir, pt+".xml");
+ File train = new File(trainDir, pt+".xml.txt");
+ if(train.exists()){
+ if(xmlTrain.exists()){
+ files.add(train);
+ }else{
+ System.err.println("Text file in training has no corresponding xml -- skipping: " + train);
+ }
+ }
+ File xmlText = new File(testDir, pt+".xml");
+ File test = new File(testDir, pt+".xml.txt");
+ if(test.exists()){
+ if(xmlText.exists()){
+ files.add(test);
+ }else{
+ System.err.println("Text file in test has no corresponding xml -- skipping: " + test);
+ }
+ }
+ assert !(train.exists() && test.exists());
+ }
+ } else{
+ throw new UnsupportedOperationException("Did not supply a valid xml format type.");
}
return files;
}
@@ -297,10 +326,20 @@ public abstract class Evaluation_ImplBas
CAS.NAME_DEFAULT_SOFA,
GOLD_VIEW_NAME);
break;
+ case I2B2:
+ aggregateBuilder.add(
+ I2B2TemporalXMLReader.getDescription(this.xmlDirectory),
+ CAS.NAME_DEFAULT_SOFA,
+ GOLD_VIEW_NAME);
+ break;
}
// identify segments
- aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SegmentsFromBracketedSectionTagsAnnotator.class));
+ if(this.xmlFormat == XMLFormat.I2B2){
+ aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
+ }else{
+ aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SegmentsFromBracketedSectionTagsAnnotator.class));
+ }
// identify sentences
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
SentenceDetector.class,
@@ -471,7 +510,14 @@ public abstract class Evaluation_ImplBas
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class));
}else{
// add ctakes constituency parses to system view
- aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class));
+ aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class,
+ ConstituencyParser.PARAM_MODEL_FILENAME,
+ "org/apache/ctakes/constituency/parser/models/thyme.bin"));
+// aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(BerkeleyParserWrapper.class,
+// BerkeleyParserWrapper.PARAM_MODEL_FILENAME,
+//
+// "org/apache/ctakes/constituency/parser/models/thyme.gcg.4sm.bin"));
+// "org/apache/ctakes/constituency/parser/models/thyme.4sm.bin"));
}
// write out the CAS after all the above annotations
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java?rev=1585816&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/I2B2Data.java Tue Apr 8 18:59:49 2014
@@ -0,0 +1,73 @@
+package org.apache.ctakes.temporal.eval;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** Splits i2b2 documents into train, dev and test sets using the numbers in their file names. */
+public class I2B2Data {
+
+  // No dev set was specified by the i2b2 challenge, so the training documents are split here:
+  // numbers below the cutoff are train (about 150 documents), the rest are dev (about 40).
+  private static final int trainDevCutoff = 600;
+
+  // The dot is escaped so that only a literal ".xml" suffix matches.
+  static Pattern filePatt = Pattern.compile("(\\d+)\\.xml");
+
+  /** Returns the numbers of the "training" documents below the train/dev cutoff. */
+  public static List<Integer> getTrainPatientSets(File xmlDirectory) {
+    List<Integer> trains = new ArrayList<>();
+    for (int ptNum : getPatientNumbers(xmlDirectory, "training")) {
+      if (ptNum < trainDevCutoff) {
+        trains.add(ptNum);
+      }
+    }
+    return trains;
+  }
+
+  /** Returns the numbers of the "training" documents at or above the train/dev cutoff. */
+  public static List<Integer> getDevPatientSets(File xmlDirectory) {
+    List<Integer> devs = new ArrayList<>();
+    for (int ptNum : getPatientNumbers(xmlDirectory, "training")) {
+      if (ptNum >= trainDevCutoff) {
+        devs.add(ptNum);
+      }
+    }
+    return devs;
+  }
+
+  /** Returns the numbers of all documents in the "test" sub-directory. */
+  public static List<Integer> getTestPatientSets(File xmlDirectory) {
+    return getPatientNumbers(xmlDirectory, "test");
+  }
+
+  /** Collects the number from every "NNN.xml" file name in the given sub-directory. */
+  private static List<Integer> getPatientNumbers(File xmlDirectory, String sub) {
+    List<Integer> numbers = new ArrayList<>();
+    for (File file : getAllFiles(xmlDirectory, sub)) {
+      Matcher m = filePatt.matcher(file.getName());
+      if (m.matches()) {
+        numbers.add(Integer.parseInt(m.group(1)));
+      }
+    }
+    return numbers;
+  }
+
+  /**
+   * Lists the ".xml" files of a sub-directory. File.listFiles returns null when the directory
+   * is missing or unreadable; an empty array is returned instead so callers get empty sets
+   * rather than a NullPointerException.
+   */
+  private static File[] getAllFiles(File xmlDirectory, String sub) {
+    File[] files = new File(xmlDirectory, sub).listFiles(new FilenameFilter() {
+      @Override
+      public boolean accept(File dir, String name) {
+        return name.endsWith(".xml");
+      }
+    });
+    return files == null ? new File[0] : files;
+  }
+}