You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/08/23 22:33:28 UTC
svn commit: r1697246 [2/2] - in /ctakes/trunk:
ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java
ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1697246&r1=1697245&r2=1697246&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Sun Aug 23 20:33:28 2015
@@ -18,39 +18,9 @@
*/
package org.apache.ctakes.temporal.eval;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.net.URI;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerConfigurationException;
-import javax.xml.transform.TransformerException;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-
+import com.google.common.collect.Lists;
+import com.google.common.io.CharStreams;
+import com.lexicalscope.jewel.cli.Option;
import org.apache.ctakes.chunker.ae.Chunker;
import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
@@ -59,14 +29,10 @@ import org.apache.ctakes.contexttokenize
import org.apache.ctakes.core.ae.OverlapAnnotator;
import org.apache.ctakes.core.ae.SentenceDetector;
import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
-import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
import org.apache.ctakes.core.resource.FileLocator;
-import org.apache.ctakes.core.resource.FileResourceImpl;
import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
-import org.apache.ctakes.dictionary.lookup2.ae.AbstractJCasTermAnnotator;
import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
-import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
import org.apache.ctakes.lvg.ae.LvgAnnotator;
import org.apache.ctakes.postagger.POSTagger;
import org.apache.ctakes.temporal.ae.I2B2TemporalXMLReader;
@@ -104,7 +70,6 @@ import org.apache.uima.fit.component.Vie
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
-import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.factory.TypePrioritiesFactory;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
@@ -112,7 +77,6 @@ import org.apache.uima.fit.util.JCasUtil
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.jcas.tcas.Annotation;
-import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.CasCopier;
import org.apache.uima.util.XMLSerializer;
@@ -125,1057 +89,1081 @@ import org.w3c.dom.Element;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import com.google.common.collect.ArrayListMultimap;
-import com.google.common.collect.Lists;
-import com.google.common.io.CharStreams;
-import com.lexicalscope.jewel.cli.Option;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.*;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+import java.io.*;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URL;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
public abstract class Evaluation_ImplBase<STATISTICS_TYPE> extends
-org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
+ org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
+
+ private static Logger LOGGER = Logger.getLogger( Evaluation_ImplBase.class );
+
+ private static final String LOOKUP_PATH = "/org/apache/ctakes/temporal/badEEContainNotes.txt";
- private static Logger LOGGER = Logger.getLogger(Evaluation_ImplBase.class);
+ private static boolean isTraining;
- private static final String LOOKUP_PATH = "/org/apache/ctakes/temporal/badEEContainNotes.txt";
-
- private static boolean isTraining;
-
- public static HashSet<String> badNotes;
-
- public static final String GOLD_VIEW_NAME = "GoldView";
-
- public enum XMLFormat { Knowtator, Anafora, I2B2 }
-
- public enum Subcorpus { Colon, Brain, DeepPhe}
-
- public static interface Options {
-
- @Option(longName = "text", defaultToNull = true)
- public File getRawTextDirectory();
-
- @Option(longName = "xml")
- public File getXMLDirectory();
-
- @Option(longName = "format", defaultValue="Anafora")
- public XMLFormat getXMLFormat();
-
- @Option(longName = "subcorpus", defaultValue="Colon")
- public Subcorpus getSubcorpus();
-
- @Option(longName = "xmi")
- public File getXMIDirectory();
-
- @Option(longName = "patients")
- public CommandLine.IntegerRanges getPatients();
-
- @Option(longName = "train-remainders", defaultValue = "0-2")
- public CommandLine.IntegerRanges getTrainRemainders();
-
- @Option(longName = "dev-remainders", defaultValue = "3")
- public CommandLine.IntegerRanges getDevRemainders();
-
- @Option(longName = "test-remainders", defaultValue = "4-5")
- public CommandLine.IntegerRanges getTestRemainders();
-
- @Option(longName = "treebank", defaultToNull=true)
- public File getTreebankDirectory();
-
- @Option
- public boolean getUseGoldTrees();
-
- @Option
- public boolean getGrid();
-
- @Option
- public boolean getPrintErrors();
-
- @Option
- public boolean getPrintOverlappingSpans();
-
- @Option
- public boolean getTest();
-
- @Option(longName = "kernelParams", defaultToNull=true)
- public String getKernelParams();
-
- @Option(defaultToNull=true)
- public String getI2B2Output();
- }
-
- public static List<Integer> getTrainItems(Options options) {
- List<Integer> patientSets = options.getPatients().getList();
- List<Integer> trainItems = THYMEData.getPatientSets(patientSets, options.getTrainRemainders().getList());
- if (options.getTest()) {
- trainItems.addAll(THYMEData.getPatientSets(patientSets, options.getDevRemainders().getList()));
- }
- return trainItems;
- }
-
- public static List<Integer> getTestItems(Options options) {
- List<Integer> patientSets = options.getPatients().getList();
- List<Integer> testItems;
- if (options.getTest()) {
- testItems = THYMEData.getPatientSets(patientSets, options.getTestRemainders().getList());
- } else {
- testItems = THYMEData.getPatientSets(patientSets, options.getDevRemainders().getList());
- }
- return testItems;
- }
-
- protected File rawTextDirectory;
-
- protected File xmlDirectory;
-
- protected XMLFormat xmlFormat;
-
- protected Subcorpus subcorpus;
-
- protected File xmiDirectory;
-
- private boolean xmiExists;
-
- protected File treebankDirectory;
-
- protected boolean printErrors = false;
-
- protected boolean printOverlapping = false;
-
- protected String i2b2Output = null;
-
- protected String[] kernelParams;
-
- public Evaluation_ImplBase(
- File baseDirectory,
- File rawTextDirectory,
- File xmlDirectory,
- XMLFormat xmlFormat,
- Subcorpus subcorpus,
- File xmiDirectory,
- File treebankDirectory) {
- super(baseDirectory);
- this.rawTextDirectory = rawTextDirectory;
- this.xmlDirectory = xmlDirectory;
- this.xmlFormat = xmlFormat;
- this.subcorpus = subcorpus;
- this.xmiDirectory = xmiDirectory;
- this.xmiExists = this.xmiDirectory.exists() && this.xmiDirectory.listFiles().length > 0;
- this.treebankDirectory = treebankDirectory;
-
- this.isTraining = true;
- this.badNotes = new HashSet<>();
- URL url = TimeWordsExtractor.class.getResource(LOOKUP_PATH);
- try (BufferedReader br = new BufferedReader(new FileReader(url.getFile()))) {
- String line;
- while ((line = br.readLine()) != null) {
- badNotes.add(line.trim());
- }
- } catch (FileNotFoundException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
-
- public void setI2B2Output(String outDir){
- i2b2Output = outDir;
- }
-
- public void prepareXMIsFor(List<Integer> patientSets) throws Exception {
- boolean needsXMIs = false;
- for (File textFile : this.getFilesFor(patientSets)) {
- if (!getXMIFile(this.xmiDirectory, textFile).exists()) {
- needsXMIs = true;
- break;
- }
- }
- if (needsXMIs) {
- CollectionReader reader = this.getCollectionReader(patientSets);
- AnalysisEngine engine = this.getXMIWritingPreprocessorAggregateBuilder().createAggregate();
- SimplePipeline.runPipeline(reader, engine);
- }
- this.xmiExists = true;
- }
-
- private List<File> getFilesFor(List<Integer> patientSets) throws FileNotFoundException {
- List<File> files = new ArrayList<>();
- if (this.xmlFormat == XMLFormat.Anafora) {
- Set<String> ids = new HashSet<>();
- for (Integer set : patientSets) {
- if(this.subcorpus == Subcorpus.Colon){
- ids.add(String.format("ID%03d", set));
- }else if(this.subcorpus == Subcorpus.DeepPhe){
- ids.add(String.format("patient%02d", set));
- }else{
- ids.add(String.format("doc%04d", set));
- }
- }
- int filePrefixLen = 5; // Colon: "ID\d{3}"
- if(this.subcorpus == Subcorpus.Brain){
- filePrefixLen = 7; // Brain: "doc\d{4}"
- }else if(this.subcorpus == Subcorpus.DeepPhe){
- filePrefixLen = 9; // deepPhe: "patient\d{2}"
- }
- if(this.subcorpus == Subcorpus.DeepPhe){
- for (File dir : this.xmlDirectory.listFiles()) {
- if (dir.isDirectory()) {
- if (ids.contains(dir.getName().substring(0, filePrefixLen))) {
- File file = new File(dir, dir.getName());
- if (file.exists()) {
- files.add(file);
- } else {
- LOGGER.warn("Missing note: " + file);
- }
- }
- }
- }
- }else{
- for (String section : THYMEData.SECTIONS){
- File xmlSubdir = new File(this.xmlDirectory, section);
- for (File dir : xmlSubdir.listFiles()) {
- if (dir.isDirectory()) {
- if (ids.contains(dir.getName().substring(0, filePrefixLen))) {
- File file = new File(dir, dir.getName());
- if (file.exists()) {
- files.add(file);
- } else {
- LOGGER.warn("Missing note: " + file);
- }
- }
- }
- }
- }
- }
- } else if(this.xmlFormat == XMLFormat.I2B2) {
- File trainDir = new File(this.xmlDirectory, "training");
- File testDir = new File(this.xmlDirectory, "test");
- for (Integer pt : patientSets){
- File xmlTrain = new File(trainDir, pt+".xml");
- File train = new File(trainDir, pt+".xml.txt");
- if(train.exists()){
- if(xmlTrain.exists()){
- files.add(train);
- }else{
- System.err.println("Text file in training has no corresponding xml -- skipping: " + train);
- }
- }
- File xmlTest = new File(testDir, pt+".xml");
- File test = new File(testDir, pt+".xml.txt");
- if(xmlTest.exists()){
- if(test.exists()){
- files.add(test);
- }else{
- throw new FileNotFoundException("Could not find the test text file -- for cTAKES usage you must copy the text files into the xml directory for the test set.");
- }
- }
- assert !(train.exists() && test.exists());
- }
- } else if( xmlFormat == XMLFormat.Knowtator) {
- LOGGER.warn("This is an old annotation format -- please upgrade to using anafora files.");
- for (Integer set : patientSets) {
- final int setNum = set;
- for (File file : rawTextDirectory.listFiles(new FilenameFilter(){
- @Override
- public boolean accept(File dir, String name) {
- return name.contains(String.format("ID%03d", setNum));
- }})) {
- // skip hidden files like .svn
- if (!file.isHidden()) {
- files.add(file);
- }
- }
- }
- } else {
- LOGGER.error("Unknown data format -- please specify Anafora, i2b2, or Knowtator format.");
- }
- return files;
- }
-
- @Override
- protected CollectionReader getCollectionReader(List<Integer> patientSets) throws Exception {
- List<File> collectedFiles = this.getFilesFor(patientSets);
- /**
- if(isTraining){
- final Collection<File> filesToRemove = new HashSet<>();
- for ( File xmiFile : collectedFiles ) {
- String fname = xmiFile.getName();
- if(this.badNotes.contains(fname)){
- LOGGER.error("Find Bad XMI file: "+fname);
- filesToRemove.add( xmiFile );
- }
- }
- collectedFiles.removeAll( filesToRemove );
- }
- isTraining = false;
- */
- return UriCollectionReader.getCollectionReaderFromFiles(collectedFiles);
- }
-
- protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception {
- return this.xmiExists
- ? this.getXMIReadingPreprocessorAggregateBuilder()
- : this.getXMIWritingPreprocessorAggregateBuilder();
- }
-
- protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException {
- AggregateBuilder aggregateBuilder = new AggregateBuilder();
- aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- XMIReader.class,
- XMIReader.PARAM_XMI_DIRECTORY,
- this.xmiDirectory));
- return aggregateBuilder;
- }
-
- protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
- throws Exception {
- AggregateBuilder aggregateBuilder = new AggregateBuilder();
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(UriToDocumentTextAnnotatorCtakes.class));
-
- // read manual annotations into gold view
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- ViewCreatorAnnotator.class,
- ViewCreatorAnnotator.PARAM_VIEW_NAME,
- GOLD_VIEW_NAME));
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- ViewTextCopierAnnotator.class,
- ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
- CAS.NAME_DEFAULT_SOFA,
- ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
- GOLD_VIEW_NAME));
- switch (this.xmlFormat) {
- case Anafora:
- aggregateBuilder.add(
- THYMEAnaforaXMLReader.getDescription(this.xmlDirectory),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME);
- break;
- case Knowtator:
- aggregateBuilder.add(
- THYMEKnowtatorXMLReader.getDescription(this.xmlDirectory),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME);
- break;
- case I2B2:
- aggregateBuilder.add(
- I2B2TemporalXMLReader.getDescription(this.xmlDirectory),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME);
- break;
- }
-
- // identify segments
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(SegmentsFromBracketedSectionTagsAnnotator.class));
-
- // identify sentences
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- SentenceDetector.class,
- SentenceDetector.SD_MODEL_FILE_PARAM,
- "org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
- // identify tokens
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class));
- // merge some tokens
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ContextDependentTokenizerAnnotator.class));
-
- // identify part-of-speech tags
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- POSTagger.class,
- TypeSystemDescriptionFactory.createTypeSystemDescription(),
- TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
- POSTagger.POS_MODEL_FILE_PARAM,
- "org/apache/ctakes/postagger/models/mayo-pos.zip"));
-
- // identify chunks
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- Chunker.class,
- Chunker.CHUNKER_MODEL_FILE_PARAM,
- FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"),
- Chunker.CHUNKER_CREATOR_CLASS_PARAM,
- DefaultChunkCreator.class));
-
- // identify UMLS named entities
-
- // adjust NP in NP NP to span both
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- ChunkAdjuster.class,
- ChunkAdjuster.PARAM_CHUNK_PATTERN,
- new String[] { "NP", "NP" },
- ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
- 1));
- // adjust NP in NP PP NP to span all three
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- ChunkAdjuster.class,
- ChunkAdjuster.PARAM_CHUNK_PATTERN,
- new String[] { "NP", "PP", "NP" },
- ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
- 2));
- // add lookup windows for each NP
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyNPChunksToLookupWindowAnnotations.class));
- // maximize lookup windows
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- OverlapAnnotator.class,
- "A_ObjectClass",
- LookupWindowAnnotation.class,
- "B_ObjectClass",
- LookupWindowAnnotation.class,
- "OverlapType",
- "A_ENV_B",
- "ActionType",
- "DELETE",
- "DeleteAction",
- new String[] { "selector=B" }));
- // add UMLS on top of lookup windows
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DefaultJCasTermAnnotator.class,
- AbstractJCasTermAnnotator.PARAM_WINDOW_ANNOT_PRP,
- "org.apache.ctakes.typesystem.type.textspan.Sentence",
- JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY, "org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml")
- );
-
- aggregateBuilder.add(LvgAnnotator.createAnnotatorDescription());
-
- // add dependency parser
- aggregateBuilder.add(ClearNLPDependencyParserAE.createAnnotatorDescription());
-
- // add semantic role labeler
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ClearNLPSemanticRoleLabelerAE.class));
-
- // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
- if(this.treebankDirectory != null){
- aggregateBuilder.add(THYMETreebankReader.getDescription(this.treebankDirectory));
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(TimexAnnotationCorrector.class));
- }else{
- // add ctakes constituency parses to system view
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ConstituencyParser.class,
- ConstituencyParser.PARAM_MODEL_FILENAME,
- "org/apache/ctakes/constituency/parser/models/thyme.bin"));
- }
- // write out the CAS after all the above annotations
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
- XMIWriter.class,
- XMIWriter.PARAM_XMI_DIRECTORY,
- this.xmiDirectory));
-
- return aggregateBuilder;
- }
-
- public static <T extends Annotation> List<T> selectExact(JCas jCas, Class<T> annotationClass, Segment segment) {
- List<T> annotations = Lists.newArrayList();
- for (T annotation : JCasUtil.selectCovered(jCas, annotationClass, segment)) {
- if (annotation.getClass().equals(annotationClass)) {
- annotations.add(annotation);
- }
- }
- return annotations;
- }
-
- public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
- if (chunk.getChunkType().equals("NP")) {
- new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
- }
- }
- }
- }
-
- public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase {
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- List<LookupWindowAnnotation> lws = new ArrayList<>(JCasUtil.select(jCas, LookupWindowAnnotation.class));
- // we'll navigate backwards so that as we delete things we shorten the list from the back
- for(int i = lws.size()-2; i >= 0; i--){
- LookupWindowAnnotation lw1 = lws.get(i);
- LookupWindowAnnotation lw2 = lws.get(i+1);
- if(lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd()){
- /// lw1 envelops or encloses lw2
- lws.remove(i+1);
- lw2.removeFromIndexes();
- }
- }
-
- }
-
- }
-
- public static class EntityMentionRemover extends JCasAnnotator_ImplBase {
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) {
- mention.removeFromIndexes();
- }
- }
- }
-
- public static class EventMentionRemover extends JCasAnnotator_ImplBase {
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) {
- mention.removeFromIndexes();
- }
- }
- }
-
- // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed
- public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase {
- private static Pattern SECTION_PATTERN = Pattern.compile(
- "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])",
- Pattern.DOTALL);
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- boolean foundSections = false;
- Matcher matcher = SECTION_PATTERN.matcher(jCas.getDocumentText());
- while (matcher.find()) {
- Segment segment = new Segment(jCas);
- segment.setBegin(matcher.start() + matcher.group(1).length());
- segment.setEnd(matcher.end() - matcher.group(3).length());
- segment.setId(matcher.group(2));
- segment.addToIndexes();
- foundSections = true;
- }
- if(!foundSections){
- Segment segment = new Segment(jCas);
- segment.setBegin(0);
- segment.setEnd(jCas.getDocumentText().length());
- segment.setId("SIMPLE_SEGMENT");
- segment.addToIndexes();
- }
- }
- }
-
- static File getXMIFile(File xmiDirectory, File textFile) {
- return new File(xmiDirectory, textFile.getName() + ".xmi");
- }
-
- static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
- return getXMIFile(xmiDirectory, new File(ViewUriUtil.getURI(jCas).getPath()));
- }
-
- public static class XMIWriter extends JCasAnnotator_ImplBase {
-
- public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
-
- @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
- private File xmiDirectory;
-
- @Override
- public void initialize(UimaContext context) throws ResourceInitializationException {
- super.initialize(context);
- if (!this.xmiDirectory.exists()) {
- this.xmiDirectory.mkdirs();
- }
- }
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- File xmiFile = getXMIFile(this.xmiDirectory, jCas);
- try {
- FileOutputStream outputStream = new FileOutputStream(xmiFile);
- try {
- XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
- ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
- serializer.serialize(jCas.getCas(), handler);
- } finally {
- outputStream.close();
- }
- } catch (SAXException e) {
- throw new AnalysisEngineProcessException(e);
- } catch (IOException e) {
- throw new AnalysisEngineProcessException(e);
- }
- }
- }
-
- public static class XMIReader extends JCasAnnotator_ImplBase {
-
- public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
-
- @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
- private File xmiDirectory;
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- File xmiFile = getXMIFile(this.xmiDirectory, jCas);
- try {
- FileInputStream inputStream = new FileInputStream(xmiFile);
- try {
- XmiCasDeserializer.deserialize(inputStream, jCas.getCas());
- } finally {
- inputStream.close();
- }
- } catch (SAXException e) {
- throw new AnalysisEngineProcessException(e);
- } catch (IOException e) {
- throw new AnalysisEngineProcessException(e);
- }
- }
- }
-
- public static class TimexAnnotationCorrector extends JCasAnnotator_ImplBase {
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- JCas goldView, systemView;
- try {
- goldView = jCas.getView(GOLD_VIEW_NAME);
- systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
- } catch (CASException e) {
- e.printStackTrace();
- throw new AnalysisEngineProcessException();
- }
- for(TimeMention mention : JCasUtil.select(goldView, TimeMention.class)){
- // for each time expression, get the treebank node with the same span.
- List<TreebankNode> nodes = JCasUtil.selectCovered(systemView, TreebankNode.class, mention);
- TreebankNode sameSpanNode = null;
- for(TreebankNode node : nodes){
- if(node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd()){
- sameSpanNode = node;
- break;
- }
- }
- if(sameSpanNode != null){
- // look at node at the position of the timex3.
- if(sameSpanNode.getNodeType().equals("PP")){
- // if it is a PP it should be moved down to the NP
- int numChildren = sameSpanNode.getChildren().size();
- if(numChildren == 2 && sameSpanNode.getChildren(0).getNodeType().equals("IN") && sameSpanNode.getChildren(1).getNodeType().equals("NP")){
- // move the time span to this node:
- TreebankNode mentionNode = sameSpanNode.getChildren(numChildren-1);
- mention.setBegin(mentionNode.getBegin());
- mention.setEnd(mentionNode.getEnd());
- }
- }
- }else{
- // if there is no matching tree span, see if the DT to the left would help.
- // now adjust for missing DT to the left
- List<TerminalTreebankNode> precedingPreterms = JCasUtil.selectPreceding(systemView, TerminalTreebankNode.class, mention, 1);
- if(precedingPreterms != null && precedingPreterms.size() == 1){
- TerminalTreebankNode leftTerm = precedingPreterms.get(0);
- if(leftTerm.getNodeType().equals("DT")){
- // now see if adding this would make it match a tree
- List<TreebankNode> matchingNodes = JCasUtil.selectCovered(systemView, TreebankNode.class, leftTerm.getBegin(), mention.getEnd());
- for(TreebankNode node : matchingNodes){
- if(node.getBegin() == leftTerm.getBegin() && node.getEnd() == mention.getEnd()){
- sameSpanNode = node;
- break;
- }
- }
- if(sameSpanNode != null){
- // adding the DT to the left of th emention made it match a tree:
- System.err.println("Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + mention.getCoveredText());
- mention.setBegin(leftTerm.getBegin());
- }
- }
- }
- }
- }
- }
- }
-
-
- public static class CopyFromGold extends JCasAnnotator_ImplBase {
-
- public static AnalysisEngineDescription getDescription(Class<?>... classes)
- throws ResourceInitializationException {
- return AnalysisEngineFactory.createEngineDescription(
- CopyFromGold.class,
- CopyFromGold.PARAM_ANNOTATION_CLASSES,
- classes);
- }
-
- public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
-
- @ConfigurationParameter(name = PARAM_ANNOTATION_CLASSES, mandatory = true)
- private Class<? extends TOP>[] annotationClasses;
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- JCas goldView, systemView;
- try {
- goldView = jCas.getView(GOLD_VIEW_NAME);
- systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
- } catch (CASException e) {
- throw new AnalysisEngineProcessException(e);
- }
- for (Class<? extends TOP> annotationClass : this.annotationClasses) {
- for (TOP annotation : Lists.newArrayList(JCasUtil.select(systemView, annotationClass))) {
- if (annotation.getClass().equals(annotationClass)) {
- annotation.removeFromIndexes();
- }
- }
- }
- CasCopier copier = new CasCopier(goldView.getCas(), systemView.getCas());
- Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA);
- for (Class<? extends TOP> annotationClass : this.annotationClasses) {
- for (TOP annotation : JCasUtil.select(goldView, annotationClass)) {
- TOP copy = (TOP) copier.copyFs(annotation);
- if (copy instanceof Annotation) {
- copy.setFeatureValue(sofaFeature, systemView.getSofa());
- }
- copy.addToIndexes(systemView);
- }
- }
- }
- }
-
- public static class CopyFromSystem extends JCasAnnotator_ImplBase {
-
- public static AnalysisEngineDescription getDescription(Class<?>... classes)
- throws ResourceInitializationException {
- return AnalysisEngineFactory.createEngineDescription(
- CopyFromSystem.class,
- CopyFromSystem.PARAM_ANNOTATION_CLASSES,
- classes);
- }
-
- public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
-
- @ConfigurationParameter(name = PARAM_ANNOTATION_CLASSES, mandatory = true)
- private Class<? extends TOP>[] annotationClasses;
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- JCas goldView, systemView;
- try {
- goldView = jCas.getView(GOLD_VIEW_NAME);
- systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
- } catch (CASException e) {
- throw new AnalysisEngineProcessException(e);
- }
- for (Class<? extends TOP> annotationClass : this.annotationClasses) {
- for (TOP annotation : Lists.newArrayList(JCasUtil.select(goldView, annotationClass))) {
- if (annotation.getClass().equals(annotationClass)) {
- annotation.removeFromIndexes();
- }
- }
- }
- CasCopier copier = new CasCopier(systemView.getCas(), goldView.getCas());
- Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA);
- for (Class<? extends TOP> annotationClass : this.annotationClasses) {
- for (TOP annotation : JCasUtil.select(systemView, annotationClass)) {
- TOP copy = (TOP) copier.copyFs(annotation);
- if (copy instanceof Annotation) {
- copy.setFeatureValue(sofaFeature, goldView.getSofa());
- }
- copy.addToIndexes(goldView);
- }
- }
- }
- }
-
- /*
- * The following class overrides a ClearTK utility annotator class for reading
- * a text file into a JCas. The code is copy/pasted so that one tiny modification
- * can be made for this corpus -- replace a single odd character (0xc) with a
- * space since it trips up xml output.
- */
- public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator {
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- URI uri = ViewUriUtil.getURI(jCas);
- String content;
-
- try {
- content = CharStreams.toString(new InputStreamReader(uri.toURL().openStream()));
- content = content.replace((char) 0xc, ' ');
- jCas.setSofaDataString(content, "text/plain");
- } catch (MalformedURLException e) {
- throw new AnalysisEngineProcessException(e);
- } catch (IOException e) {
- throw new AnalysisEngineProcessException(e);
- }
- }
- }
-
- public static class WriteI2B2XML extends JCasAnnotator_ImplBase {
- public static final String PARAM_OUTPUT_DIR="PARAM_OUTPUT_DIR";
- @ConfigurationParameter(mandatory=true,description="Output directory to write xml files to.",name=PARAM_OUTPUT_DIR)
- protected String outputDir;
-
- @Override
- public void process(JCas jcas) throws AnalysisEngineProcessException {
- try {
- // get the output file name from the input file name and output directory.
- File outDir = new File(outputDir);
- if(!outDir.exists()) outDir.mkdirs();
- File inFile = new File(ViewUriUtil.getURI(jcas));
- String outFile = inFile.getName().replace(".txt", "");
-
- // build the xml
- DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
- DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
- Document doc = docBuilder.newDocument();
- Element rootElement = doc.createElement("ClinicalNarrativeTemporalAnnotation");
- Element textElement = doc.createElement("TEXT");
- Element tagsElement = doc.createElement("TAGS");
- textElement.setTextContent(jcas.getDocumentText());
- rootElement.appendChild(textElement);
- rootElement.appendChild(tagsElement);
- doc.appendChild(rootElement);
-
- Map<IdentifiedAnnotation,String> argToId = new HashMap<>();
- int id=0;
- for(TimeMention timex : JCasUtil.select(jcas, TimeMention.class)){
- Element timexElement = doc.createElement("TIMEX3");
- String timexID = "T"+id; id++;
- argToId.put(timex, timexID);
- timexElement.setAttribute("id", timexID);
- timexElement.setAttribute("start", String.valueOf(timex.getBegin()+1));
- timexElement.setAttribute("end", String.valueOf(timex.getEnd()+1));
- timexElement.setAttribute("text", timex.getCoveredText());
- timexElement.setAttribute("type", "NA");
- timexElement.setAttribute("val", "NA");
- timexElement.setAttribute("mod", "NA");
- tagsElement.appendChild(timexElement);
- }
-
- id = 0;
- for(EventMention event : JCasUtil.select(jcas, EventMention.class)){
- if (event.getClass().equals(EventMention.class)) {
- // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events
- Element eventEl = doc.createElement("EVENT");
- String eventID = "E"+id; id++;
- argToId.put(event, eventID);
- eventEl.setAttribute("id", eventID);
- eventEl.setAttribute("start", String.valueOf(event.getBegin()+1));
- eventEl.setAttribute("end", String.valueOf(event.getEnd()+1));
- eventEl.setAttribute("text", event.getCoveredText());
- eventEl.setAttribute("modality", "NA");
- eventEl.setAttribute("polarity", "NA");
- eventEl.setAttribute("type", "NA");
- tagsElement.appendChild(eventEl);
- }
- }
-
- id = 0;
- for(TemporalTextRelation rel : JCasUtil.select(jcas, TemporalTextRelation.class)){
- Element linkEl = doc.createElement("TLINK");
- String linkID = "TL"+id; id++;
- linkEl.setAttribute("id", linkID);
- Annotation arg1 = rel.getArg1().getArgument();
- linkEl.setAttribute("fromID", argToId.get(arg1));
- linkEl.setAttribute("fromText", arg1.getCoveredText());
- Annotation arg2 = rel.getArg2().getArgument();
- if(arg2!=null){
- linkEl.setAttribute("toID", argToId.get(arg2));
- linkEl.setAttribute("toText", arg2.getCoveredText());
- }else{
- linkEl.setAttribute("toID", "Discharge");
- linkEl.setAttribute("toText", "Discharge");
- }
- linkEl.setAttribute("type", rel.getCategory());
- tagsElement.appendChild(linkEl);
- }
-
- // boilerplate xml-writing code:
- TransformerFactory transformerFactory = TransformerFactory.newInstance();
- Transformer transformer = transformerFactory.newTransformer();
- transformer.setOutputProperty(OutputKeys.INDENT, "yes");
- transformer.setOutputProperty(OutputKeys.METHOD, "xml");
- DOMSource source = new DOMSource(doc);
- StreamResult result = new StreamResult(new File(outputDir, outFile));
- transformer.transform(source, result);
- } catch (ParserConfigurationException e) {
- e.printStackTrace();
- throw new AnalysisEngineProcessException(e);
- } catch (TransformerConfigurationException e) {
- e.printStackTrace();
- throw new AnalysisEngineProcessException(e);
- } catch (TransformerException e) {
- e.printStackTrace();
- throw new AnalysisEngineProcessException(e);
- }
-
- }
-
- }
-
- public static class WriteAnaforaXML extends JCasAnnotator_ImplBase {
- public static final String PARAM_OUTPUT_DIR="PARAM_OUTPUT_DIR";
- @ConfigurationParameter(mandatory=true,description="Output directory to write xml files to.",name=PARAM_OUTPUT_DIR)
- protected String outputDir;
-
- @Override
- public void process(JCas jcas) throws AnalysisEngineProcessException {
- try {
- // get the output file name from the input file name and output directory.
-
- File inFile = new File(ViewUriUtil.getURI(jcas));
- String outFile = inFile.getName().replace(".txt", "");
- File outDir = new File(outputDir, outFile);
- if(!outDir.exists()) outDir.mkdirs();
-
- // build the xml
- DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
- DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
- Document doc = docBuilder.newDocument();
-
- Element rootElement = doc.createElement("data");
-
- //info element
- Element infoElement = doc.createElement("info");
- Element saveTime = doc.createElement("savetime");
- saveTime.setTextContent("2015-0123-10:21");
- Element progress = doc.createElement("progress");
- progress.setTextContent("completed");
- infoElement.appendChild(saveTime);
- infoElement.appendChild(progress);
-
- //schema element
- Element schema = doc.createElement("schema");
- schema.setAttribute("path", "./");
- schema.setAttribute("protocol", "file");
- schema.setTextContent("temporal-schema.xml");
-
- Element annoElement = doc.createElement("annotations");
- Map<IdentifiedAnnotation,String> argToId = new HashMap<>();
- int id=1;
- for(EventMention event : JCasUtil.select(jcas, EventMention.class)){
- if (event.getClass().equals(EventMention.class)) {
- // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events
- Element eventEl = doc.createElement("entity");
- String eventID = id+"@e@"+outFile+"@system"; id++;
- argToId.put(event, eventID);
- Element idE = doc.createElement("id");
- idE.setTextContent(eventID);
- Element spanE = doc.createElement("span");
- spanE.setTextContent(String.valueOf(event.getBegin())+","+String.valueOf(event.getEnd()));
- Element typeE = doc.createElement("type");
- typeE.setTextContent("EVENT");
- Element parentTE = doc.createElement("parentsType");
- parentTE.setTextContent("TemporalEntities");
- //add properties
- Element property = doc.createElement("properties");
- Element docTimeRE = doc.createElement("DocTimeRel");
- docTimeRE.setTextContent(event.getEvent().getProperties().getDocTimeRel());
- Element eventTypeE = doc.createElement("Type");
- eventTypeE.setTextContent("N/A");
- Element degreeE = doc.createElement("Degree");
- degreeE.setTextContent("N/A");
- Element polarityE = doc.createElement("Polarity");
- String polarity = "UNKNOWN";
- int polarityInt = event.getPolarity();
- if(polarityInt == CONST.NE_POLARITY_NEGATION_ABSENT) polarity= "POS";
- else if(polarityInt == CONST.NE_POLARITY_NEGATION_PRESENT) polarity = "NEG";
- polarityE.setTextContent(polarity);
- Element ctexModE = doc.createElement("ContextualModality");
- ctexModE.setTextContent(event.getEvent().getProperties().getContextualModality());
- Element ctexAspE = doc.createElement("ContextualAspect");
- ctexAspE.setTextContent(event.getEvent().getProperties().getContextualAspect());
- Element permE = doc.createElement("Permanence");
- permE.setTextContent("UNDETERMINED");
- property.appendChild(docTimeRE);
- property.appendChild(polarityE);
- property.appendChild(degreeE);
- property.appendChild(eventTypeE);
- property.appendChild(ctexModE);
- property.appendChild(ctexAspE);
- property.appendChild(permE);
- eventEl.appendChild(idE);
- eventEl.appendChild(spanE);
- eventEl.appendChild(typeE);
- eventEl.appendChild(parentTE);
- eventEl.appendChild(property);
- annoElement.appendChild(eventEl);
- }
- }
- for(TimeMention timex : JCasUtil.select(jcas, TimeMention.class)){
- Element timexElement = doc.createElement("entity");
- String timexID = id+"@e@"+outFile+"@system"; id++;//18@e@ID006_clinic_016@gold
- argToId.put(timex, timexID);
- Element idE = doc.createElement("id");
- idE.setTextContent(timexID);
- Element spanE = doc.createElement("span");
- spanE.setTextContent(String.valueOf(timex.getBegin())+","+String.valueOf(timex.getEnd()));
- Element typeE = doc.createElement("type");
- Element parentTE = doc.createElement("parentsType");
- parentTE.setTextContent("TemporalEntities");
- //add properties
- Element property = doc.createElement("properties");
- String timeClass = timex.getTimeClass();
- if(timeClass.equals("DOCTIME")||timeClass.equals("SECTIONTIME")){
- typeE.setTextContent(timeClass);
- property.setTextContent("");
- }else{
- typeE.setTextContent("TIMEX3");
- Element classE = doc.createElement("Class");
- classE.setTextContent(timeClass);
- property.appendChild(classE);
- }
-
- timexElement.appendChild(idE);
- timexElement.appendChild(spanE);
- timexElement.appendChild(typeE);
- timexElement.appendChild(property);
- annoElement.appendChild(timexElement);
- }
-
-
-
- id = 1;
- for(TemporalTextRelation rel : JCasUtil.select(jcas, TemporalTextRelation.class)){
- Element linkEl = doc.createElement("relation");
- String linkID = id+"@r@"+outFile+"@system";id++;
-
- Element idE = doc.createElement("id");
- idE.setTextContent(linkID);
- Element typeE = doc.createElement("type");
- typeE.setTextContent("TLINK");
- Element parentTE = doc.createElement("parentsType");
- parentTE.setTextContent("TemporalRelations");
- //add properties
- Element property = doc.createElement("properties");
-
- Annotation arg1 = rel.getArg1().getArgument();
- Element sourceE = doc.createElement("Source");
- sourceE.setTextContent(argToId.get(arg1));
- Element relTypeE = doc.createElement("Type");
- relTypeE.setTextContent(rel.getCategory());
- Annotation arg2 = rel.getArg2().getArgument();
- Element targetE = doc.createElement("Target");
- targetE.setTextContent(argToId.get(arg2));
-
- property.appendChild(sourceE);
- property.appendChild(relTypeE);
- property.appendChild(targetE);
-
- linkEl.appendChild(idE);
- linkEl.appendChild(typeE);
- linkEl.appendChild(parentTE);
- linkEl.appendChild(property);
- annoElement.appendChild(linkEl);
- }
-
- rootElement.appendChild(infoElement);
- rootElement.appendChild(schema);
- rootElement.appendChild(annoElement);
- doc.appendChild(rootElement);
-
- // boilerplate xml-writing code:
- TransformerFactory transformerFactory = TransformerFactory.newInstance();
- Transformer transformer = transformerFactory.newTransformer();
- transformer.setOutputProperty(OutputKeys.INDENT, "yes");
- transformer.setOutputProperty(OutputKeys.METHOD, "xml");
- DOMSource source = new DOMSource(doc);
- StreamResult result = new StreamResult(new File(outDir, outFile+".xml"));
- transformer.transform(source, result);
- } catch (ParserConfigurationException e) {
- e.printStackTrace();
- throw new AnalysisEngineProcessException(e);
- } catch (TransformerConfigurationException e) {
- e.printStackTrace();
- throw new AnalysisEngineProcessException(e);
- } catch (TransformerException e) {
- e.printStackTrace();
- throw new AnalysisEngineProcessException(e);
- }
+ // Names of notes flagged as bad. The constructor replaces this set and refills it
+ // with the trimmed lines of the LOOKUP_PATH resource every time an instance is built.
+ public static HashSet<String> badNotes;
+
+ // Name of the CAS view that receives the manual (gold) annotations.
+ public static final String GOLD_VIEW_NAME = "GoldView";
+
+ // Supported gold-annotation formats; selects both the file layout that is scanned
+ // and the XML reader that is added to the preprocessing pipeline.
+ public enum XMLFormat {Knowtator, Anafora, I2B2}
+
+ // Corpus subsets; decides the note-id prefix used when collecting Anafora files.
+ public enum Subcorpus {Colon, Brain, DeepPhe}
+
+ /**
+ * Command-line options shared by the temporal evaluations. Each getter is bound to a
+ * command-line flag through its {@code @Option} annotation.
+ */
+ public static interface Options {
+
+ // directory of raw text notes; null when the flag is not given
+ @Option( longName = "text", defaultToNull = true )
+ public File getRawTextDirectory();
+
+ // directory holding the gold annotation xml files
+ @Option( longName = "xml" )
+ public File getXMLDirectory();
+
+ // annotation file format, Anafora unless overridden
+ @Option( longName = "format", defaultValue = "Anafora" )
+ public XMLFormat getXMLFormat();
+
+ // corpus subset, Colon unless overridden
+ @Option( longName = "subcorpus", defaultValue = "Colon" )
+ public Subcorpus getSubcorpus();
+
+ // directory where preprocessed CASes are cached as XMI files
+ @Option( longName = "xmi" )
+ public File getXMIDirectory();
+
+ // patient set numbers to work on
+ @Option( longName = "patients" )
+ public CommandLine.IntegerRanges getPatients();
+
+ // The three "remainders" options are handed to THYMEData.getPatientSets together with
+ // the patient list to pick the train / dev / test portions.
+ // NOTE(review): presumably remainders of the patient number under some modulus -- confirm in THYMEData.
+ @Option( longName = "train-remainders", defaultValue = "0-2" )
+ public CommandLine.IntegerRanges getTrainRemainders();
+
+ @Option( longName = "dev-remainders", defaultValue = "3" )
+ public CommandLine.IntegerRanges getDevRemainders();
+
+ @Option( longName = "test-remainders", defaultValue = "4-5" )
+ public CommandLine.IntegerRanges getTestRemainders();
+
+ // directory of gold treebank parses; null when the flag is not given
+ @Option( longName = "treebank", defaultToNull = true )
+ public File getTreebankDirectory();
+
+ @Option
+ public boolean getUseGoldTrees();
+
+ @Option
+ public boolean getGrid();
+
+ @Option
+ public boolean getPrintErrors();
+
+ @Option
+ public boolean getPrintOverlappingSpans();
+
+ // when set, evaluation runs on the test portion and dev data is added to training
+ @Option
+ public boolean getTest();
+
+ @Option( longName = "kernelParams", defaultToNull = true )
+ public String getKernelParams();
+
+ @Option( defaultToNull = true )
+ public String getI2B2Output();
+ }
+
+ /**
+  * Resolves the patient sets used for training.
+  *
+  * @param options parsed command-line options
+  * @return the training portion of the requested patients; when the test flag is set the
+  *         development portion is appended as well
+  */
+ public static List<Integer> getTrainItems( Options options ) {
+    final List<Integer> patients = options.getPatients().getList();
+    final List<Integer> trainingSets =
+          THYMEData.getPatientSets( patients, options.getTrainRemainders().getList() );
+    if ( !options.getTest() ) {
+       return trainingSets;
+    }
+    // in test mode the development data is folded into the training data
+    final List<Integer> devSets =
+          THYMEData.getPatientSets( patients, options.getDevRemainders().getList() );
+    trainingSets.addAll( devSets );
+    return trainingSets;
+ }
+
+ /**
+  * Resolves the patient sets used for evaluation.
+  *
+  * @param options parsed command-line options
+  * @return the test portion of the requested patients when the test flag is set,
+  *         otherwise the development portion
+  */
+ public static List<Integer> getTestItems( Options options ) {
+    final List<Integer> patients = options.getPatients().getList();
+    final List<Integer> remainders = options.getTest()
+                                     ? options.getTestRemainders().getList()
+                                     : options.getDevRemainders().getList();
+    return THYMEData.getPatientSets( patients, remainders );
+ }
+
+ // directory listed for note files when the Knowtator format is used
+ protected File rawTextDirectory;
+
+ // root directory of the gold annotation files
+ protected File xmlDirectory;
+
+ // format of the gold annotation files
+ protected XMLFormat xmlFormat;
+
+ // corpus subset; controls the note-id prefix for Anafora directories
+ protected Subcorpus subcorpus;
+
+ // directory where preprocessed CASes are written to / read from as XMI
+ protected File xmiDirectory;
+
+ // true once XMI files are known to be available; chooses between the
+ // XMI-reading and the XMI-writing preprocessing pipeline
+ private boolean xmiExists;
+
+ // gold treebank directory; when null the constituency parser is run instead
+ protected File treebankDirectory;
+
+ protected boolean printErrors = false;
+
+ protected boolean printOverlapping = false;
+
+ // output directory set through setI2B2Output; null until then
+ protected String i2b2Output = null;
+
+ protected String[] kernelParams;
+
+ public Evaluation_ImplBase(
+ File baseDirectory,
+ File rawTextDirectory,
+ File xmlDirectory,
+ XMLFormat xmlFormat,
+ Subcorpus subcorpus,
+ File xmiDirectory,
+ File treebankDirectory ) {
+ super( baseDirectory );
+ this.rawTextDirectory = rawTextDirectory;
+ this.xmlDirectory = xmlDirectory;
+ this.xmlFormat = xmlFormat;
+ this.subcorpus = subcorpus;
+ this.xmiDirectory = xmiDirectory;
+ this.xmiExists = this.xmiDirectory.exists() && this.xmiDirectory.listFiles().length > 0;
+ this.treebankDirectory = treebankDirectory;
+
+ this.isTraining = true;
+ this.badNotes = new HashSet<>();
+ URL url = TimeWordsExtractor.class.getResource( LOOKUP_PATH );
+ try ( BufferedReader br = new BufferedReader( new FileReader( url.getFile() ) ) ) {
+ String line;
+ while ( (line = br.readLine()) != null ) {
+ badNotes.add( line.trim() );
+ }
+ } catch ( FileNotFoundException e ) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ } catch ( IOException e ) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+
+ /**
+  * Sets the directory that i2b2 output is written to.
+  *
+  * @param outDir output directory path
+  */
+ public void setI2B2Output( String outDir ) {
+    this.i2b2Output = outDir;
+ }
+
+ /**
+  * Makes sure an XMI file exists for every note of the given patient sets. If at least
+  * one is missing, the XMI-writing preprocessing pipeline is run over all of the sets.
+  * Afterwards the instance is marked as having XMI files available.
+  *
+  * @param patientSets patient set numbers to prepare
+  * @throws Exception if collecting the files or running the pipeline fails
+  */
+ public void prepareXMIsFor( List<Integer> patientSets ) throws Exception {
+    boolean allXmisPresent = true;
+    for ( File noteFile : this.getFilesFor( patientSets ) ) {
+       File xmi = getXMIFile( this.xmiDirectory, noteFile );
+       if ( !xmi.exists() ) {
+          // one missing file is enough to trigger a full preprocessing run
+          allXmisPresent = false;
+          break;
+       }
+    }
+    if ( !allXmisPresent ) {
+       CollectionReader noteReader = this.getCollectionReader( patientSets );
+       AnalysisEngine xmiWritingEngine = this.getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+       SimplePipeline.runPipeline( noteReader, xmiWritingEngine );
+    }
+    this.xmiExists = true;
+ }
+
+ /**
+  * Collects the note files that belong to the given patient sets, using the directory
+  * layout implied by {@link #xmlFormat}:
+  * <ul>
+  * <li>Anafora: one directory per note, containing a file with the directory's own name;
+  *     DeepPhe note directories sit directly under the xml directory, the other subcorpora
+  *     under one sub-directory per THYMEData section.</li>
+  * <li>I2B2: {@code <n>.xml.txt} files under {@code training} and {@code test}.</li>
+  * <li>Knowtator: non-hidden files of the raw text directory whose name contains the id.</li>
+  * </ul>
+  * Directories that cannot be listed are logged and skipped.
+  *
+  * @param patientSets patient set numbers
+  * @return the matching note files, possibly empty
+  * @throws FileNotFoundException if an i2b2 test xml has no text file next to it, or if the
+  *                               Knowtator format is used without a raw text directory
+  */
+ private List<File> getFilesFor( List<Integer> patientSets ) throws FileNotFoundException {
+    List<File> files = new ArrayList<>();
+    if ( this.xmlFormat == XMLFormat.Anafora ) {
+       Set<String> ids = new HashSet<>();
+       for ( Integer set : patientSets ) {
+          if ( this.subcorpus == Subcorpus.Colon ) {
+             ids.add( String.format( "ID%03d", set ) );
+          } else if ( this.subcorpus == Subcorpus.DeepPhe ) {
+             ids.add( String.format( "patient%02d", set ) );
+          } else {
+             ids.add( String.format( "doc%04d", set ) );
+          }
+       }
+       int filePrefixLen = 5; // Colon: "ID\d{3}"
+       if ( this.subcorpus == Subcorpus.Brain ) {
+          filePrefixLen = 7; // Brain: "doc\d{4}"
+       } else if ( this.subcorpus == Subcorpus.DeepPhe ) {
+          filePrefixLen = 9; // deepPhe: "patient\d{2}"
+       }
+       if ( this.subcorpus == Subcorpus.DeepPhe ) {
+          collectAnaforaNotes( this.xmlDirectory, ids, filePrefixLen, files );
+       } else {
+          for ( String section : THYMEData.SECTIONS ) {
+             collectAnaforaNotes( new File( this.xmlDirectory, section ), ids, filePrefixLen, files );
+          }
+       }
+    } else if ( this.xmlFormat == XMLFormat.I2B2 ) {
+       File trainDir = new File( this.xmlDirectory, "training" );
+       File testDir = new File( this.xmlDirectory, "test" );
+       for ( Integer pt : patientSets ) {
+          File xmlTrain = new File( trainDir, pt + ".xml" );
+          File train = new File( trainDir, pt + ".xml.txt" );
+          if ( train.exists() ) {
+             if ( xmlTrain.exists() ) {
+                files.add( train );
+             } else {
+                LOGGER.warn( "Text file in training has no corresponding xml -- skipping: " + train );
+             }
+          }
+          File xmlTest = new File( testDir, pt + ".xml" );
+          File test = new File( testDir, pt + ".xml.txt" );
+          if ( xmlTest.exists() ) {
+             if ( test.exists() ) {
+                files.add( test );
+             } else {
+                throw new FileNotFoundException( "Could not find the test text file -- for cTAKES usage you must copy the text files into the xml directory for the test set." );
+             }
+          }
+          assert !(train.exists() && test.exists());
+       }
+    } else if ( xmlFormat == XMLFormat.Knowtator ) {
+       LOGGER.warn( "This is an old annotation format -- please upgrade to using anafora files." );
+       if ( rawTextDirectory == null ) {
+          // the "text" option defaults to null, so fail with a clear message instead of an NPE
+          throw new FileNotFoundException( "The Knowtator format requires a raw text directory." );
+       }
+       for ( Integer set : patientSets ) {
+          // computed once per set instead of once per candidate file name
+          final String idToken = String.format( "ID%03d", set );
+          File[] matches = rawTextDirectory.listFiles( new FilenameFilter() {
+             @Override
+             public boolean accept( File dir, String name ) {
+                return name.contains( idToken );
+             }
+          } );
+          if ( matches == null ) {
+             LOGGER.warn( "Cannot list raw text directory: " + rawTextDirectory );
+             continue;
+          }
+          for ( File file : matches ) {
+             // skip hidden files like .svn
+             if ( !file.isHidden() ) {
+                files.add( file );
+             }
+          }
+       }
+    } else {
+       LOGGER.error( "Unknown data format -- please specify Anafora, i2b2, or Knowtator format." );
+    }
+    return files;
+ }
+
+ /**
+  * Adds, for every sub-directory of {@code parent} whose name starts with one of {@code ids},
+  * the file inside it that carries the directory's own name. Missing note files are logged.
+  */
+ private void collectAnaforaNotes( File parent, Set<String> ids, int prefixLen, List<File> files ) {
+    File[] noteDirs = parent.listFiles();
+    if ( noteDirs == null ) {
+       // missing directory or not a directory: nothing to collect here
+       LOGGER.warn( "Cannot list annotation directory: " + parent );
+       return;
+    }
+    for ( File dir : noteDirs ) {
+       String dirName = dir.getName();
+       // names shorter than the id prefix cannot match and would break substring()
+       if ( !dir.isDirectory() || dirName.length() < prefixLen
+            || !ids.contains( dirName.substring( 0, prefixLen ) ) ) {
+          continue;
+       }
+       File file = new File( dir, dirName );
+       if ( file.exists() ) {
+          files.add( file );
+       } else {
+          LOGGER.warn( "Missing note: " + file );
+       }
+    }
+ }
+
+ /**
+ * Creates a reader over the note files of the given patient sets.
+ *
+ * @param patientSets patient set numbers
+ * @return a URI collection reader over the files returned by getFilesFor
+ * @throws Exception if the files cannot be collected or the reader cannot be created
+ */
+ @Override
+ protected CollectionReader getCollectionReader( List<Integer> patientSets ) throws Exception {
+ List<File> collectedFiles = this.getFilesFor( patientSets );
+ // The block below is disabled code, kept for reference: during training it would drop
+ // every collected file whose name is listed in badNotes, then clear the isTraining flag.
+ /**
+ if(isTraining){
+ final Collection<File> filesToRemove = new HashSet<>();
+ for ( File xmiFile : collectedFiles ) {
+ String fname = xmiFile.getName();
+ if(this.badNotes.contains(fname)){
+ LOGGER.error("Find Bad XMI file: "+fname);
+ filesToRemove.add( xmiFile );
+ }
+ }
+ collectedFiles.removeAll( filesToRemove );
+ }
+ isTraining = false;
+ */
+ return UriCollectionReader.getCollectionReaderFromFiles( collectedFiles );
+ }
+
+ /**
+  * Picks the preprocessing pipeline: the XMI-reading one when cached XMI files are
+  * available, otherwise the full pipeline that also writes them.
+  *
+  * @return the aggregate builder for the chosen pipeline
+  * @throws Exception if the pipeline cannot be assembled
+  */
+ protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception {
+    if ( this.xmiExists ) {
+       return this.getXMIReadingPreprocessorAggregateBuilder();
+    }
+    return this.getXMIWritingPreprocessorAggregateBuilder();
+ }
+
+ /**
+  * Builds the lightweight pipeline that loads the document text and then restores the
+  * previously serialized annotations from the XMI directory.
+  *
+  * @return the aggregate builder holding the two engines
+  * @throws UIMAException if an engine description cannot be created
+  */
+ protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException {
+    AggregateBuilder builder = new AggregateBuilder();
+    builder.add( UriToDocumentTextAnnotator.getDescription() );
+    AnalysisEngineDescription xmiReader = AnalysisEngineFactory.createEngineDescription(
+          XMIReader.class,
+          XMIReader.PARAM_XMI_DIRECTORY,
+          this.xmiDirectory );
+    builder.add( xmiReader );
+    return builder;
+ }
+
+ /**
+ * Builds the full preprocessing pipeline. In order, it loads the document text, creates
+ * the gold view and reads the manual annotations into it, then adds segments, sentences,
+ * tokens, part-of-speech tags, chunks, lookup windows, dictionary lookup, LVG, dependency
+ * parses and semantic roles. Next come either gold treebank parses (when a treebank
+ * directory is configured) or automatic constituency parses. The last engine writes the
+ * resulting CAS to the XMI directory.
+ *
+ * @return the aggregate builder holding all of the engines above
+ * @throws Exception if any engine description cannot be created
+ */
+ protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
+ throws Exception {
+ AggregateBuilder aggregateBuilder = new AggregateBuilder();
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( UriToDocumentTextAnnotatorCtakes.class ) );
+
+ // read manual annotations into gold view
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ ViewCreatorAnnotator.class,
+ ViewCreatorAnnotator.PARAM_VIEW_NAME,
+ GOLD_VIEW_NAME ) );
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ ViewTextCopierAnnotator.class,
+ ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
+ CAS.NAME_DEFAULT_SOFA,
+ ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
+ GOLD_VIEW_NAME ) );
+ // each reader is added with the default sofa name mapped to the gold view;
+ // there is no default branch, so an unlisted format adds no reader
+ switch ( this.xmlFormat ) {
+ case Anafora:
+ aggregateBuilder.add(
+ THYMEAnaforaXMLReader.getDescription( this.xmlDirectory ),
+ CAS.NAME_DEFAULT_SOFA,
+ GOLD_VIEW_NAME );
+ break;
+ case Knowtator:
+ aggregateBuilder.add(
+ THYMEKnowtatorXMLReader.getDescription( this.xmlDirectory ),
+ CAS.NAME_DEFAULT_SOFA,
+ GOLD_VIEW_NAME );
+ break;
+ case I2B2:
+ aggregateBuilder.add(
+ I2B2TemporalXMLReader.getDescription( this.xmlDirectory ),
+ CAS.NAME_DEFAULT_SOFA,
+ GOLD_VIEW_NAME );
+ break;
+ }
+
+ // identify segments
+ aggregateBuilder
+ .add( AnalysisEngineFactory.createEngineDescription( SegmentsFromBracketedSectionTagsAnnotator.class ) );
+
+ // identify sentences
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ SentenceDetector.class,
+ SentenceDetector.SD_MODEL_FILE_PARAM,
+ "org/apache/ctakes/core/sentdetect/sd-med-model.zip" ) );
+ // identify tokens
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TokenizerAnnotatorPTB.class ) );
+ // merge some tokens
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ContextDependentTokenizerAnnotator.class ) );
+
+ // identify part-of-speech tags
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ POSTagger.class,
+ TypeSystemDescriptionFactory.createTypeSystemDescription(),
+ TypePrioritiesFactory.createTypePriorities( Segment.class, Sentence.class, BaseToken.class ),
+ POSTagger.POS_MODEL_FILE_PARAM,
+ "org/apache/ctakes/postagger/models/mayo-pos.zip" ) );
+
+ // identify chunks
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ Chunker.class,
+ Chunker.CHUNKER_MODEL_FILE_PARAM,
+ FileLocator.locateFile( "org/apache/ctakes/chunker/models/chunker-model.zip" ),
+ Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+ DefaultChunkCreator.class ) );
+
+ // identify UMLS named entities (the engines from here through the dictionary lookup)
+
+ // adjust NP in NP NP to span both
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ ChunkAdjuster.class,
+ ChunkAdjuster.PARAM_CHUNK_PATTERN,
+ new String[] { "NP", "NP" },
+ ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+ 1 ) );
+ // adjust NP in NP PP NP to span all three
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ ChunkAdjuster.class,
+ ChunkAdjuster.PARAM_CHUNK_PATTERN,
+ new String[] { "NP", "PP", "NP" },
+ ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+ 2 ) );
+ // add lookup windows for each NP
+ aggregateBuilder
+ .add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) );
+ // maximize lookup windows
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ OverlapAnnotator.class,
+ "A_ObjectClass",
+ LookupWindowAnnotation.class,
+ "B_ObjectClass",
+ LookupWindowAnnotation.class,
+ "OverlapType",
+ "A_ENV_B",
+ "ActionType",
+ "DELETE",
+ "DeleteAction",
+ new String[] { "selector=B" } ) );
+ // add UMLS on top of lookup windows
+ aggregateBuilder.add( DefaultJCasTermAnnotator.createAnnotatorDescription() );
+
+ aggregateBuilder.add( LvgAnnotator.createAnnotatorDescription() );
+
+ // add dependency parser
+ aggregateBuilder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
+
+ // add semantic role labeler
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ClearNLPSemanticRoleLabelerAE.class ) );
+
+ // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
+ if ( this.treebankDirectory != null ) {
+ aggregateBuilder.add( THYMETreebankReader.getDescription( this.treebankDirectory ) );
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TimexAnnotationCorrector.class ) );
+ } else {
+ // add ctakes constituency parses to system view
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class,
+ ConstituencyParser.PARAM_MODEL_FILENAME,
+ "org/apache/ctakes/constituency/parser/models/thyme.bin" ) );
+ }
+ // write out the CAS after all the above annotations
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ XMIWriter.class,
+ XMIWriter.PARAM_XMI_DIRECTORY,
+ this.xmiDirectory ) );
+
+ return aggregateBuilder;
+ }
+
+ /**
+  * Selects the annotations covered by a segment whose runtime class is exactly the
+  * requested class; instances of subclasses are left out.
+  *
+  * @param jCas            CAS to search
+  * @param annotationClass exact annotation class wanted
+  * @param segment         segment that must cover the annotations
+  * @return a new list of the matching annotations, in the order they were selected
+  */
+ public static <T extends Annotation> List<T> selectExact( JCas jCas, Class<T> annotationClass, Segment segment ) {
+    List<T> exactMatches = Lists.newArrayList();
+    for ( T candidate : JCasUtil.selectCovered( jCas, annotationClass, segment ) ) {
+       if ( !candidate.getClass().equals( annotationClass ) ) {
+          // a subclass instance -- not what was asked for
+          continue;
+       }
+       exactMatches.add( candidate );
+    }
+    return exactMatches;
+ }
+
+ /**
+  * Adds a {@code LookupWindowAnnotation} with the same span as every chunk whose
+  * chunk type is "NP".
+  */
+ public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    /**
+     * @param jCas CAS whose NP chunks are copied to lookup windows
+     * @throws AnalysisEngineProcessException declared by the annotator contract
+     */
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       for ( Chunk chunk : JCasUtil.select( jCas, Chunk.class ) ) {
+          // constant-first comparison: a chunk with no type set is skipped instead of
+          // raising a NullPointerException
+          if ( "NP".equals( chunk.getChunkType() ) ) {
+             new LookupWindowAnnotation( jCas, chunk.getBegin(), chunk.getEnd() ).addToIndexes();
+          }
+       }
+    }
+ }
+
+ /**
+ * Removes a lookup window from the indexes when the window directly before it in the
+ * selected list starts at or before it and ends at or after it.
+ */
+ public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase {
+
+ @Override
+ public void process( JCas jCas ) throws AnalysisEngineProcessException {
+ // work on a copy so the list can be edited while annotations are removed from the indexes
+ List<LookupWindowAnnotation> lws = new ArrayList<>( JCasUtil.select( jCas, LookupWindowAnnotation.class ) );
+ // we'll navigate backwards so that as we delete things we shorten the list from the back
+ // NOTE(review): only neighbouring entries are compared, so a window enclosed by a
+ // non-adjacent earlier window may survive -- confirm whether that is intended.
+ for ( int i = lws.size() - 2; i >= 0; i-- ) {
+ LookupWindowAnnotation lw1 = lws.get( i );
+ LookupWindowAnnotation lw2 = lws.get( i + 1 );
+ if ( lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd() ) {
+ /// lw1 envelops or encloses lw2
+ lws.remove( i + 1 );
+ lw2.removeFromIndexes();
+ }
+ }
+
+ }
+
+ }
+
+ /** Removes every {@code EntityMention} from the indexes of the CAS it is given. */
+ public static class EntityMentionRemover extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       // copy first so the index is not modified while it is being iterated
+       List<EntityMention> snapshot = Lists.newArrayList( JCasUtil.select( jCas, EntityMention.class ) );
+       for ( EntityMention entity : snapshot ) {
+          entity.removeFromIndexes();
+       }
+    }
+ }
+
+ /** Removes every {@code EventMention} from the indexes of the CAS it is given. */
+ public static class EventMentionRemover extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       // copy first so the index is not modified while it is being iterated
+       List<EventMention> snapshot = Lists.newArrayList( JCasUtil.select( jCas, EventMention.class ) );
+       for ( EventMention event : snapshot ) {
+          event.removeFromIndexes();
+       }
+    }
+ }
+
+ // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed
+ /**
+  * Creates one {@code Segment} per bracketed section, i.e. the text between a
+  * {@code [start section id=...]} tag and the next {@code [end section id=...]} tag.
+  * The segment id is the id given in the start tag. When the document contains no such
+  * section, a single segment with id "SIMPLE_SEGMENT" spanning the whole text is added.
+  */
+ public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase {
+    // group 1: whole start tag, group 2: start id, group 3: whole end tag, group 4: end id;
+    // final, so the shared compiled pattern cannot be reassigned
+    private static final Pattern SECTION_PATTERN = Pattern.compile(
+          "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])",
+          Pattern.DOTALL );
+
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       // fetch the text once; it is needed for matching and for the fallback segment
+       final String documentText = jCas.getDocumentText();
+       boolean foundSections = false;
+       Matcher matcher = SECTION_PATTERN.matcher( documentText );
+       while ( matcher.find() ) {
+          Segment segment = new Segment( jCas );
+          // the segment covers the text between the tags, not the tags themselves
+          segment.setBegin( matcher.start() + matcher.group( 1 ).length() );
+          segment.setEnd( matcher.end() - matcher.group( 3 ).length() );
+          segment.setId( matcher.group( 2 ) );
+          segment.addToIndexes();
+          foundSections = true;
+       }
+       if ( !foundSections ) {
+          Segment segment = new Segment( jCas );
+          segment.setBegin( 0 );
+          segment.setEnd( documentText.length() );
+          segment.setId( "SIMPLE_SEGMENT" );
+          segment.addToIndexes();
+       }
+    }
+ }
+
+ /**
+  * Maps a text file to its XMI file: same file name with ".xmi" appended, placed in
+  * the XMI directory.
+  *
+  * @param xmiDirectory directory holding the XMI files
+  * @param textFile     note file; only its name is used
+  * @return the XMI file location (the file itself may not exist)
+  */
+ static File getXMIFile( File xmiDirectory, File textFile ) {
+    String xmiName = textFile.getName() + ".xmi";
+    return new File( xmiDirectory, xmiName );
+ }
+
+ /**
+  * Maps the document held by a CAS to its XMI file, using the path of the URI
+  * recorded for the CAS.
+  *
+  * @param xmiDirectory directory holding the XMI files
+  * @param jCas         CAS whose source URI names the note
+  * @return the XMI file location (the file itself may not exist)
+  * @throws AnalysisEngineProcessException if the URI cannot be read from the CAS
+  */
+ static File getXMIFile( File xmiDirectory, JCas jCas ) throws AnalysisEngineProcessException {
+    String notePath = ViewUriUtil.getURI( jCas ).getPath();
+    return getXMIFile( xmiDirectory, new File( notePath ) );
+ }
+
+ /**
+  * Serializes each processed CAS as an XMI file in the configured directory. The file
+  * name is derived from the CAS's source URI via {@code getXMIFile}.
+  */
+ public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true )
+    private File xmiDirectory;
+
+    /**
+     * Creates the XMI directory when it does not exist yet.
+     *
+     * @throws ResourceInitializationException if the directory cannot be created
+     */
+    @Override
+    public void initialize( UimaContext context ) throws ResourceInitializationException {
+       super.initialize( context );
+       // mkdirs() reports failure only through its return value; isDirectory() is re-checked
+       // so that a directory created concurrently by someone else is not treated as an error
+       if ( !this.xmiDirectory.exists()
+            && !this.xmiDirectory.mkdirs()
+            && !this.xmiDirectory.isDirectory() ) {
+          throw new ResourceInitializationException(
+                new IOException( "Could not create XMI directory: " + this.xmiDirectory ) );
+       }
+    }
+
+    /**
+     * Writes the CAS to its XMI file.
+     *
+     * @throws AnalysisEngineProcessException wrapping any serialization or I/O failure
+     */
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       File xmiFile = getXMIFile( this.xmiDirectory, jCas );
+       // try-with-resources closes the stream and keeps the primary exception if close() fails too
+       try ( FileOutputStream outputStream = new FileOutputStream( xmiFile ) ) {
+          XmiCasSerializer serializer = new XmiCasSerializer( jCas.getTypeSystem() );
+          ContentHandler handler = new XMLSerializer( outputStream, false ).getContentHandler();
+          serializer.serialize( jCas.getCas(), handler );
+       } catch ( SAXException | IOException e ) {
+          throw new AnalysisEngineProcessException( e );
+       }
+    }
+ }
+
+ /**
+  * Restores a previously serialized CAS from the XMI file that corresponds to the
+  * CAS's source URI (see {@code getXMIFile}).
+  */
+ public static class XMIReader extends JCasAnnotator_ImplBase {
+
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter( name = PARAM_XMI_DIRECTORY, mandatory = true )
+    private File xmiDirectory;
+
+    /**
+     * Deserializes the XMI file into the given CAS.
+     *
+     * @throws AnalysisEngineProcessException wrapping any parse or I/O failure,
+     *                                        including a missing XMI file
+     */
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       File xmiFile = getXMIFile( this.xmiDirectory, jCas );
+       // try-with-resources closes the stream and keeps the primary exception if close() fails too
+       try ( FileInputStream inputStream = new FileInputStream( xmiFile ) ) {
+          XmiCasDeserializer.deserialize( inputStream, jCas.getCas() );
+       } catch ( SAXException | IOException e ) {
+          throw new AnalysisEngineProcessException( e );
+       }
+    }
+ }
+
+ /**
+  * Adjusts the spans of gold-view time mentions so that they line up with treebank nodes
+  * in the system view:
+  * <ul>
+  * <li>a mention that exactly covers a PP made of an IN followed by an NP is shrunk to the NP;</li>
+  * <li>a mention with no same-span node is extended over an immediately preceding DT when
+  *     the extended span matches a treebank node.</li>
+  * </ul>
+  */
+ public static class TimexAnnotationCorrector extends JCasAnnotator_ImplBase {
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       JCas goldView, systemView;
+       try {
+          goldView = jCas.getView( GOLD_VIEW_NAME );
+          systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA );
+       } catch ( CASException e ) {
+          // keep the cause so the failing view can be diagnosed by the caller
+          throw new AnalysisEngineProcessException( e );
+       }
+       for ( TimeMention mention : JCasUtil.select( goldView, TimeMention.class ) ) {
+          // for each time expression, get the treebank node with the same span.
+          List<TreebankNode> nodes = JCasUtil.selectCovered( systemView, TreebankNode.class, mention );
+          TreebankNode sameSpanNode = null;
+          for ( TreebankNode node : nodes ) {
+             if ( node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd() ) {
+                sameSpanNode = node;
+                break;
+             }
+          }
+          if ( sameSpanNode != null ) {
+             // look at node at the position of the timex3.
+             if ( sameSpanNode.getNodeType().equals( "PP" ) ) {
+                // if it is a PP it should be moved down to the NP
+                int numChildren = sameSpanNode.getChildren().size();
+                if ( numChildren == 2 && sameSpanNode.getChildren( 0 ).getNodeType().equals( "IN" ) &&
+                     sameSpanNode.getChildren( 1 ).getNodeType().equals( "NP" ) ) {
+                   // move the time span to this node:
+                   TreebankNode mentionNode = sameSpanNode.getChildren( numChildren - 1 );
+                   mention.setBegin( mentionNode.getBegin() );
+                   mention.setEnd( mentionNode.getEnd() );
+                }
+             }
+          } else {
+             // if there is no matching tree span, see if the DT to the left would help.
+             // now adjust for missing DT to the left
+             List<TerminalTreebankNode> precedingPreterms = JCasUtil
+                   .selectPreceding( systemView, TerminalTreebankNode.class, mention, 1 );
+             if ( precedingPreterms != null && precedingPreterms.size() == 1 ) {
+                TerminalTreebankNode leftTerm = precedingPreterms.get( 0 );
+                if ( leftTerm.getNodeType().equals( "DT" ) ) {
+                   // now see if adding this would make it match a tree
+                   List<TreebankNode> matchingNodes = JCasUtil
+                         .selectCovered( systemView, TreebankNode.class, leftTerm.getBegin(), mention.getEnd() );
+                   for ( TreebankNode node : matchingNodes ) {
+                      if ( node.getBegin() == leftTerm.getBegin() && node.getEnd() == mention.getEnd() ) {
+                         sameSpanNode = node;
+                         break;
+                      }
+                   }
+                   if ( sameSpanNode != null ) {
+                      // adding the DT to the left of the mention made it match a tree:
+                      System.err.println(
+                            "Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + mention.getCoveredText() );
+                      mention.setBegin( leftTerm.getBegin() );
+                   }
+                }
+             }
+          }
+       }
+    }
+ }
+
+
+ /**
+  * Replaces system-view annotations of the configured classes with copies of the
+  * gold-view annotations of those classes.
+  */
+ public static class CopyFromGold extends JCasAnnotator_ImplBase {
+
+    public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
+
+    @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = true )
+    private Class<? extends TOP>[] annotationClasses;
+
+    /**
+     * @param classes annotation classes to copy from the gold view
+     * @return an engine description configured with those classes
+     * @throws ResourceInitializationException if the description cannot be created
+     */
+    public static AnalysisEngineDescription getDescription( Class<?>... classes )
+          throws ResourceInitializationException {
+       return AnalysisEngineFactory.createEngineDescription(
+             CopyFromGold.class,
+             CopyFromGold.PARAM_ANNOTATION_CLASSES,
+             classes );
+    }
+
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       JCas goldView;
+       JCas systemView;
+       try {
+          goldView = jCas.getView( GOLD_VIEW_NAME );
+          systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA );
+       } catch ( CASException e ) {
+          throw new AnalysisEngineProcessException( e );
+       }
+
+       // step 1: drop system annotations whose class is exactly one of the configured ones
+       for ( Class<? extends TOP> type : this.annotationClasses ) {
+          for ( TOP existing : Lists.newArrayList( JCasUtil.select( systemView, type ) ) ) {
+             if ( !existing.getClass().equals( type ) ) {
+                continue;
+             }
+             existing.removeFromIndexes();
+          }
+       }
+
+       // step 2: copy the gold annotations over, re-pointing annotations at the system sofa
+       CasCopier goldToSystem = new CasCopier( goldView.getCas(), systemView.getCas() );
+       Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName( CAS.FEATURE_FULL_NAME_SOFA );
+       for ( Class<? extends TOP> type : this.annotationClasses ) {
+          for ( TOP goldFs : JCasUtil.select( goldView, type ) ) {
+             TOP systemFs = (TOP)goldToSystem.copyFs( goldFs );
+             if ( systemFs instanceof Annotation ) {
+                systemFs.setFeatureValue( sofaFeature, systemView.getSofa() );
+             }
+             systemFs.addToIndexes( systemView );
+          }
+       }
+    }
+ }
+
+ /**
+  * Annotator that overwrites gold-view annotations of the configured classes
+  * with copies of the annotations found in the default (system) view.
+  * <p>
+  * Processing happens in two passes over the configured classes: first the
+  * matching annotations are removed from the gold view, then every annotation
+  * selected from the system view is copied and indexed in the gold view.
+  */
+ public static class CopyFromSystem extends JCasAnnotator_ImplBase {
+
+    /** Name of the configuration parameter holding the annotation classes to copy. */
+    public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
+
+    @ConfigurationParameter( name = PARAM_ANNOTATION_CLASSES, mandatory = true )
+    private Class<? extends TOP>[] annotationClasses;
+
+    /**
+     * Builds an engine description for this annotator.
+     *
+     * @param classes annotation classes to copy from the system view into the gold view
+     * @return description with {@link #PARAM_ANNOTATION_CLASSES} set to {@code classes}
+     * @throws ResourceInitializationException if the description cannot be created
+     */
+    public static AnalysisEngineDescription getDescription( Class<?>... classes )
+          throws ResourceInitializationException {
+       return AnalysisEngineFactory.createEngineDescription(
+             CopyFromSystem.class, CopyFromSystem.PARAM_ANNOTATION_CLASSES, classes );
+    }
+
+    /**
+     * Replaces the configured annotation types in the gold view with system copies.
+     *
+     * @param jCas CAS providing both the gold view and the default view
+     * @throws AnalysisEngineProcessException if either view cannot be obtained
+     */
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       JCas goldView;
+       JCas systemView;
+       try {
+          goldView = jCas.getView( GOLD_VIEW_NAME );
+          systemView = jCas.getView( CAS.NAME_DEFAULT_SOFA );
+       } catch ( CASException casE ) {
+          throw new AnalysisEngineProcessException( casE );
+       }
+
+       // pass 1: clear out what the gold view currently holds for every configured class
+       for ( Class<? extends TOP> type : this.annotationClasses ) {
+          unindexExact( goldView, type );
+       }
+
+       // pass 2: copy system annotations over and index them in the gold view
+       CasCopier systemToGold = new CasCopier( systemView.getCas(), goldView.getCas() );
+       Feature sofa = jCas.getTypeSystem().getFeatureByFullName( CAS.FEATURE_FULL_NAME_SOFA );
+       for ( Class<? extends TOP> type : this.annotationClasses ) {
+          for ( TOP original : JCasUtil.select( systemView, type ) ) {
+             TOP duplicate = (TOP)systemToGold.copyFs( original );
+             if ( duplicate instanceof Annotation ) {
+                // text annotations get their sofa feature pointed at the gold view's sofa
+                duplicate.setFeatureValue( sofa, goldView.getSofa() );
+             }
+             duplicate.addToIndexes( goldView );
+          }
+       }
+    }
+
+    /**
+     * Removes from the indexes each annotation selected from {@code view} whose
+     * runtime class is exactly {@code type}; other selected annotations are left alone.
+     */
+    private static void unindexExact( JCas view, Class<? extends TOP> type ) {
+       // work on a snapshot list, presumably so removals do not disturb the iteration
+       for ( TOP candidate : Lists.newArrayList( JCasUtil.select( view, type ) ) ) {
+          if ( candidate.getClass().equals( type ) ) {
+             candidate.removeFromIndexes();
+          }
+       }
+    }
+ }
+
+ /**
+  * Overrides the ClearTK utility annotator that reads a text file into a JCas.
+  * The reading code is duplicated here so that one small modification can be
+  * made for this corpus: every occurrence of the odd character 0xc is replaced
+  * with a space, since that character trips up xml output.
+  */
+ public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator {
+
+    /**
+     * Reads the text behind the view URI, replaces each 0xc character with a
+     * space and stores the result as the plain-text sofa data of {@code jCas}.
+     *
+     * @param jCas CAS whose view URI names the document to load
+     * @throws AnalysisEngineProcessException if the URI cannot be turned into a URL
+     *                                        or the content cannot be read
+     */
+    @Override
+    public void process( JCas jCas ) throws AnalysisEngineProcessException {
+       URI uri = ViewUriUtil.getURI( jCas );
+       String content;
+
+       // try-with-resources: the reader (and the URL stream it wraps) is closed
+       // whether or not reading succeeds, instead of being left open.
+       // NOTE(review): the reader still decodes with the platform default charset,
+       // as before -- confirm whether the corpus needs an explicit charset.
+       try ( InputStreamReader reader = new InputStreamReader( uri.toURL().openStream() ) ) {
+          content = CharStreams.toString( reader );
+       } catch ( IOException e ) {
+          // MalformedURLException is an IOException, so this one handler covers both
+          throw new AnalysisEngineProcessException( e );
+       }
+       content = content.replace( (char)0xc, ' ' );
+       jCas.setSofaDataString( content, "text/plain" );
+    }
+ }
+
+ public static class WriteI2B2XML extends JCasAnnotator_ImplBase {
+ public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR";
+ @ConfigurationParameter( mandatory = true, description = "Output directory to write xml files to.", name = PARAM_OUTPUT_DIR )
+ protected String outputDir;
+
+ @Override
+ public void process( JCas jcas ) throws AnalysisEngineProcessException {
+ try {
+ // get the output file name from the input file name and output directory.
+ File outDir = new File( outputDir );
+ if ( !outDir.exists() ) {
+ outDir.mkdirs();
+ }
+ File inFile = new File( ViewUriUtil.getURI( jcas ) );
+ String outFile = inFile.getName().replace( ".txt", "" );
+
+ // build the xml
+ DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+ DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+ Document doc = docBuilder.newDocument();
+ Element rootElement = doc.createElement( "ClinicalNarrativeTemporalAnnotation" );
+ Element textElement = doc.createElement( "TEXT" );
+ Element tagsElement = doc.createElement( "TAGS" );
+ textElement.setTextContent( jcas.getDocumentText() );
+ rootElement.appendChild( textElement );
+ rootElement.appendChild( tagsElement );
+ doc.appendChild( rootElement );
+
+ Map<IdentifiedAnnotation, String> argToId = new HashMap<>();
+ int id = 0;
+ for ( TimeMention timex : JCasUtil.select( jcas, TimeMention.class ) ) {
+ Element timexElement = doc.createElement( "TIMEX3" );
+ String timexID = "T" + id;
+ id++;
+ argToId.put( timex, timexID );
+ timexElement.setAttribute( "id", timexID );
+ timexElement.setAttribute( "start", String.valueOf( timex.getBegin() + 1 ) );
+ timexElement.setAttribute( "end", String.valueOf( timex.getEnd() + 1 ) );
+ timexElement.setAttribute( "text", timex.getCoveredText() );
+ timexElement.setAttribute( "type", "NA" );
+ timexElement.setAttribute( "val", "NA" );
+ timexElement.setAttribute( "mod", "NA" );
+ tagsElement.appendChild( timexElement );
+ }
+
+ id = 0;
+ for ( EventMention event : JCasUtil.select( jcas, EventMention.class ) ) {
+ if ( event.getClass().equals( EventMention.class ) ) {
+ // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events
+ Element eventEl = doc.createElement( "EVENT" );
+ String eventID = "E" + id;
+ id++;
+ argToId.put( event, eventID );
+ eventEl.setAttribute( "id", eventID );
+ eventEl.setAttribute( "start", String.valueOf( event.getBegin() + 1 ) );
+ eventEl.setAttribute( "end", String.valueOf( event.getEnd() + 1 ) );
+ eventEl.setAttribute( "text", event.getCoveredText() );
+ eventEl.setAttribute( "modality", "NA" );
+ eventEl.setAttribute( "polarity", "NA" );
+ eventEl.setAttribute( "type", "NA" );
+ tagsElement.appendChild( eventEl );
+ }
+ }
+
+ id = 0;
+ for ( TemporalTextRelation rel : JCasUtil.select( jcas, TemporalTextRelation.class ) ) {
+ Element linkEl = doc.createElement( "TLINK" );
+ String linkID = "TL" + id;
+ id++;
+ linkEl.setAttribute( "id", linkID );
+ Annotation arg1 = rel.getArg1().getArgument();
+ linkEl.setAttribute( "fromID", argToId.get( arg1 ) );
+ linkEl.setAttribute( "fromText", arg1.getCoveredText() );
+ Annotation arg2 = rel.getArg2().getArgument();
+ if ( arg2 != null ) {
+ linkEl.setAttribute( "toID", argToId.get( arg2 ) );
+ linkEl.setAttribute( "toText", arg2.getCoveredText() );
+ } else {
+ linkEl.setAttribute( "toID", "Discharge" );
+ linkEl.setAttribute( "toText", "Discharge" );
+ }
+ linkEl.setAttribute( "type", rel.getCategory() );
+ tagsElement.appendChild( linkEl );
+ }
+
+ // boilerplate xml-writing code:
+ TransformerFactory transformerFactory = TransformerFactory.newInstance();
+ Transformer transformer = transformerFactory.newTransformer();
+ transformer.setOutputProperty( OutputKeys.INDENT, "yes" );
+ transformer.setOutputProperty( OutputKeys.METHOD, "xml" );
+ DOMSource source = new DOMSource( doc );
+ StreamResult result = new StreamResult( new File( outputDir, outFile ) );
+ transformer.transform( source, result );
+ } catch ( ParserConfigurationException e ) {
+ e.printStackTrace();
+ throw new AnalysisEngineProcessException( e );
+ } catch ( TransformerConfigurationException e ) {
+ e.printStackTrace();
+ throw new AnalysisEngineProcessException( e );
+ } catch ( TransformerException e ) {
+ e.printStackTrace();
+ throw new AnalysisEngineProcessException( e );
+ }
+
+ }
+
+ }
+
+ public static class WriteAnaforaXML extends JCasAnnotator_ImplBase {
+ public static final String PARAM_OUTPUT_DIR = "PARAM_OUTPUT_DIR";
+ @ConfigurationParameter( mandatory = true, description = "Output directory to write xml files to.", name = PARAM_OUTPUT_DIR )
+ protected String outputDir;
+
+ @Override
+ public void process( JCas jcas ) throws AnalysisEngineProcessException {
+ try {
+ // get the output file name from the input file name and output directory.
+
+ File inFile = new File( ViewUriUtil.getURI( jcas ) );
+ String outFile = inFile.getName().replace( ".txt", "" );
+ File outDir = new File( outputDir, outFile );
+ if ( !outDir.exists() ) {
+ outDir.mkdirs();
+ }
+
+ // build the xml
+ DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+ DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+ Document doc = docBuilder.newDocument();
+
+ Element rootElement = doc.createElement( "data" );
+
+ //info element
+ Element infoElement = doc.createElement( "info" );
+ Element saveTime = doc.createElement( "savetime" );
+ saveTime.setTextContent( "2015-0123-10:21" );
+ Element progress = doc.createElement( "progress" );
+ progress.setTextContent( "completed" );
+ infoElement.appendChild( saveTime );
+ infoElement.appendChild( progress );
+
+ //schema element
+ Element schema = doc.createElement( "schema" );
+ schema.setAttribute( "path", "./" );
+ schema.setAttribute( "protocol", "file" );
+ schema.setTextContent( "temporal-schema.xml" );
+
+ Element annoElement = doc.createElement( "annotations" );
+ Map<IdentifiedAnnotation, String> argToId = new HashMap<>();
+ int id = 1;
+ for ( EventMention event : JCasUtil.select( jcas, EventMention.class ) ) {
+ if ( event.getClass().equals( EventMention.class ) ) {
+ // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events
+ Element eventEl = doc.createElement( "entity" );
+ String eventID = id + "@e@" + outFile + "@system";
+ id++;
+ argToId.put( event, eventID );
+ Element idE = doc.createElement( "id" );
+ idE.setTextContent( eventID );
+ Element spanE = doc.createElement( "span" );
+ spanE.setTextContent( String.valueOf( event.getBegin() ) + "," + String.valueOf( event.getEnd() ) );
+ Element typeE = doc.createElement( "type" );
+ typeE.setTextContent( "EVENT" );
+ Element parentTE = doc.createElement( "parentsType" );
+ parentTE.setTextContent( "TemporalEntities" );
+ //add properties
+ Element property = doc.createElement( "properties" );
+ Element docTimeRE = doc.createElement( "DocTimeRel" );
+ docTimeRE.setTextContent( event.getEvent().getProperties().getDocTimeRel() );
+ Element eventTypeE = doc.createElement( "Type" );
+ eventTypeE.setTextContent( "N/A" );
+ Element degreeE = doc.createElement( "Degree" );
+ degreeE.setTextContent( "N/A" );
+ Element polarityE = doc.createElement( "Polarity" );
+ String polarity = "UNKNOWN";
+ int polarityInt = event.getPolarity();
+ if ( polarityInt == CONST.NE_POLARITY_NEGATION_ABSENT ) {
+ polarity = "POS";
+ } else if ( polarityInt == CONST.NE_POLARITY_NEGATION_PRESENT ) {
+ polarity = "NEG";
+ }
+ polarityE.setTextContent( polarity );
+ Element ctexModE = doc.createElement( "ContextualModality" );
+ ctexModE.setTextContent( event.getEvent().getProperties().getContextualModality() );
+ Element ctexAspE = doc.createElement( "ContextualAspect" );
+ ctexAspE.setTextContent( event.getEvent().getProperties().getContextualAspect() );
+ Element permE = doc.createElement( "Permanence" );
+ permE.setTextContent( "UNDETERMINED" );
+ property.appendChild( docTimeRE );
+ property.appendChild( polarityE );
+ property.appendChild( degreeE );
+ property.appendChild( eventTypeE );
+ property.appendChild( ctexModE );
+ property.appendChild( ctexAspE );
+ property.appendChild( permE );
+ eventEl.appendChild( idE );
+ eventEl.appendChild( spanE );
+ eventEl.appendChild( typeE );
+ eventEl.appendChild( parentTE );
+ eventEl.appendChild( property );
+ annoElement.appendChild( eventEl );
+ }
+ }
+ for ( TimeMention timex : JCasUtil.select( jcas, TimeMention.class ) ) {
+ Element timexElement = doc.createElement( "entity" );
+ String timexID = id + "@e@" + outFile + "@system";
+ id++;//18@e@ID006_clinic_016@gold
+ argToId.put( timex, timexID );
+ Element idE = doc.createElement( "id" );
+ idE.setTextContent( timexID );
+ Element spanE = doc.createElement( "span" );
+ spanE.setTextContent( String.valueOf( timex.getBegin() ) + "," + String.valueOf( timex.getEnd() ) );
+ Element typeE = doc.createElement( "type" );
+ Element parentTE = doc.createElement( "parentsType" );
+ parentTE.setTextContent( "TemporalEntities" );
+ //add properties
+ Element property = doc.createElement( "properties" );
+ String timeClass = timex.getTimeClass();
+ if ( timeClass.equals( "DOCTIME" ) || timeClass.equals( "SECTIONTIME" ) ) {
+ typeE.setTextContent( timeClass );
+ property.setTextContent( "" );
+ } else {
+ typeE.setTextContent( "TIMEX3" );
+ Element classE = doc.createElement( "Class" );
+ classE.setTextContent( timeClass );
+ property.appendChild( classE );
+ }
+
+ timexElement.appendChild( idE );
+ timexElement.appendChild( spanE );
+ timexElement.appendChild( typeE );
+ timexElement.appendChild( property );
+ annoElement.appendChild( timexElement );
+ }
+
+
+ id = 1;
+ for ( TemporalTextRelation rel : JCasUtil.select( jcas, TemporalTextRelation.class ) ) {
+ Element linkEl = doc.createElement( "relation" );
+ String linkID = id + "@r@" + outFile + "@system";
+ id++;
+
+ Element idE = doc.createElement( "id" );
+ idE.setTextContent( linkID );
+ Element typeE = doc.createElement( "type" );
+ typeE.setTextContent( "TLINK" );
+ Element parentTE = doc.createElement( "parentsType" );
+ parentTE.setTextContent( "TemporalRelations" );
+ //add properties
+ Element property = doc.createElement( "properties" );
+
+ Annotation arg1 = rel.getArg1().getArgument();
+ Element sourceE = doc.createElement( "Source" );
+ sourceE.setTextContent( argToId.get( arg1 ) );
+ Element relTypeE = doc.createElement( "Type" );
+ relTypeE.setTextContent( rel.getCategory() );
+ Annotation arg2 = rel.getArg2().getArgument();
+ Element targetE = doc.createElement( "Target" );
+ targetE.setTextContent( argToId.get( arg2 ) );
+
+ property.appendChild( sourceE );
[... 41 lines stripped ...]