You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2017/08/29 20:29:52 UTC
svn commit: r1806645 -
/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
Author: tmill
Date: Tue Aug 29 20:29:52 2017
New Revision: 1806645
URL: http://svn.apache.org/viewvc?rev=1806645&view=rev
Log:
Add new eval interface that multi-doc coref uses.
Modified:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1806645&r1=1806644&r2=1806645&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Tue Aug 29 20:29:52 2017
@@ -18,9 +18,41 @@
*/
package org.apache.ctakes.temporal.eval;
-import com.google.common.collect.Lists;
-import com.google.common.io.CharStreams;
-import com.lexicalscope.jewel.cli.Option;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.commons.io.FileUtils;
import org.apache.ctakes.chunker.ae.Chunker;
import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
@@ -41,6 +73,7 @@ import org.apache.ctakes.temporal.ae.THY
import org.apache.ctakes.temporal.ae.THYMEKnowtatorXMLReader;
import org.apache.ctakes.temporal.ae.THYMETreebankReader;
import org.apache.ctakes.temporal.duration.Utils;
+import org.apache.ctakes.temporal.utils.PatientViewsUtil;
import org.apache.ctakes.typesystem.type.constants.CONST;
import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
@@ -91,19 +124,9 @@ import org.w3c.dom.Element;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.transform.*;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-import java.io.*;
-import java.net.MalformedURLException;
-import java.net.URI;
-import java.net.URL;
-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import com.google.common.collect.Lists;
+import com.google.common.io.CharStreams;
+import com.lexicalscope.jewel.cli.Option;
//import org.apache.ctakes.core.cleartk.ae.SentenceDetectorAnnotator;
//import org.threeten.bp.temporal.TemporalUnit;
@@ -122,8 +145,10 @@ org.cleartk.eval.Evaluation_ImplBase<Int
public static final String GOLD_VIEW_NAME = "GoldView";
public static final String PROB_VIEW_NAME = "ProbView";
+
+ public static final int MAX_DOC_VIEWS = 3;
- public enum XMLFormat {Knowtator, Anafora, I2B2}
+ public enum XMLFormat {Knowtator, Anafora, I2B2, AnaforaCoref}
public enum Subcorpus {Colon, Brain, DeepPhe}
@@ -346,6 +371,22 @@ org.cleartk.eval.Evaluation_ImplBase<Int
}
}
}
+ } else if ( this.xmlFormat == XMLFormat.AnaforaCoref){
+ Set<String> ids = new HashSet<>();
+ for ( Integer set : patientSets ) {
+ if ( this.subcorpus == Subcorpus.Colon ) {
+ ids.add( String.format( "ID%03d", set ) );
+ } else {
+ LOGGER.warn("No coreference annotations exist for this corpus!");
+ }
+ }
+ for(File dir : this.xmlDirectory.listFiles() ){
+ if(dir.isDirectory()){
+ if(ids.contains(dir.getName())){
+ files.add(dir);
+ }
+ }
+ }
} else if ( this.xmlFormat == XMLFormat.I2B2 ) {
File trainDir = new File( this.xmlDirectory, "training" );
File testDir = new File( this.xmlDirectory, "test" );
@@ -424,6 +465,7 @@ org.cleartk.eval.Evaluation_ImplBase<Int
protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException {
AggregateBuilder aggregateBuilder = new AggregateBuilder();
+ // TODO: Is this necessary? Doesn't the default view have the text populated in the xmis?
aggregateBuilder.add( UriToDocumentTextAnnotator.getDescription() );
aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
XMIReader.class,
@@ -436,137 +478,9 @@ org.cleartk.eval.Evaluation_ImplBase<Int
throws Exception {
AggregateBuilder aggregateBuilder = new AggregateBuilder();
aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( UriToDocumentTextAnnotatorCtakes.class ) );
-
- // read manual annotations into gold view
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
- ViewCreatorAnnotator.class,
- ViewCreatorAnnotator.PARAM_VIEW_NAME,
- GOLD_VIEW_NAME ) );
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
- ViewTextCopierAnnotator.class,
- ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
- CAS.NAME_DEFAULT_SOFA,
- ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
- GOLD_VIEW_NAME ) );
- switch ( this.xmlFormat ) {
- case Anafora:
- if(this.subcorpus == Subcorpus.DeepPhe){
- aggregateBuilder.add(
- AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class,
- THYMEAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY,
- this.xmlDirectory,
- THYMEAnaforaXMLReader.PARAM_ANAFORA_XML_SUFFIXES,
- new String[]{} ),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME );
- }else{
- aggregateBuilder.add(
- THYMEAnaforaXMLReader.getDescription( this.xmlDirectory ),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME );
- }
- break;
- case Knowtator:
- aggregateBuilder.add(
- THYMEKnowtatorXMLReader.getDescription( this.xmlDirectory ),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME );
- break;
- case I2B2:
- aggregateBuilder.add(
- I2B2TemporalXMLReader.getDescription( this.xmlDirectory ),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME );
- break;
- }
-
- // identify segments
- if(this.subcorpus == Subcorpus.DeepPhe){
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PittHeaderAnnotator.class));
- }else{
- aggregateBuilder
- .add( AnalysisEngineFactory.createEngineDescription( SegmentsFromBracketedSectionTagsAnnotator.class ) );
- }
- // identify sentences
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
- SentenceDetector.class,
- SentenceDetector.SD_MODEL_FILE_PARAM,
- "org/apache/ctakes/core/sentdetect/sd-med-model.zip" ) );
- // aggregateBuilder.add(SentenceDetectorAnnotatorBIO.getDescription(FileLocator.locateFile("org/apache/ctakes/core/sentdetect/model.jar").getPath()));
-
- // identify tokens
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TokenizerAnnotatorPTB.class ) );
- // merge some tokens
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ContextDependentTokenizerAnnotator.class ) );
-
- // identify part-of-speech tags
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
- POSTagger.class,
- TypeSystemDescriptionFactory.createTypeSystemDescription(),
- TypePrioritiesFactory.createTypePriorities( Segment.class, Sentence.class, BaseToken.class ),
- POSTagger.POS_MODEL_FILE_PARAM,
- "org/apache/ctakes/postagger/models/mayo-pos.zip" ) );
-
- // identify chunks
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
- Chunker.class,
- Chunker.CHUNKER_MODEL_FILE_PARAM,
- FileLocator.locateFile( "org/apache/ctakes/chunker/models/chunker-model.zip" ),
- Chunker.CHUNKER_CREATOR_CLASS_PARAM,
- DefaultChunkCreator.class ) );
-
- // identify UMLS named entities
-
- // adjust NP in NP NP to span both
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
- ChunkAdjuster.class,
- ChunkAdjuster.PARAM_CHUNK_PATTERN,
- new String[] { "NP", "NP" },
- ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
- 1 ) );
- // adjust NP in NP PP NP to span all three
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
- ChunkAdjuster.class,
- ChunkAdjuster.PARAM_CHUNK_PATTERN,
- new String[] { "NP", "PP", "NP" },
- ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
- 2 ) );
- // add lookup windows for each NP
- aggregateBuilder
- .add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) );
- // maximize lookup windows
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
- OverlapAnnotator.class,
- "A_ObjectClass",
- LookupWindowAnnotation.class,
- "B_ObjectClass",
- LookupWindowAnnotation.class,
- "OverlapType",
- "A_ENV_B",
- "ActionType",
- "DELETE",
- "DeleteAction",
- new String[] { "selector=B" } ) );
- // add UMLS on top of lookup windows
- aggregateBuilder.add( LvgAnnotator.createAnnotatorDescription() );
- aggregateBuilder.add( DefaultJCasTermAnnotator.createAnnotatorDescription() );
-
- // add dependency parser
- aggregateBuilder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
-
- // add semantic role labeler
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ClearNLPSemanticRoleLabelerAE.class ) );
-
- // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
- if ( this.treebankDirectory != null ) {
- aggregateBuilder.add( THYMETreebankReader.getDescription( this.treebankDirectory ) );
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TimexAnnotationCorrector.class ) );
- } else {
- // add ctakes constituency parses to system view
- aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class,
- ConstituencyParser.PARAM_MODEL_FILENAME,
- "org/apache/ctakes/constituency/parser/models/thyme.bin" ) );
- }
+ aggregateBuilder.add( getGoldWritingAggregate() );
+ aggregateBuilder.add( getLinguisticProcessingDescription() );
+
// write out the CAS after all the above annotations
aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
XMIWriter.class,
@@ -576,6 +490,151 @@ org.cleartk.eval.Evaluation_ImplBase<Int
return aggregateBuilder;
}
+ protected AnalysisEngineDescription getGoldWritingAggregate() throws Exception {
+ return getGoldWritingAggregate(GOLD_VIEW_NAME);
+ }
+
+ protected AnalysisEngineDescription getGoldWritingAggregate(String goldViewName) throws Exception {
+ AggregateBuilder aggregateBuilder = new AggregateBuilder();
+ // read manual annotations into gold view
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ ViewCreatorAnnotator.class,
+ ViewCreatorAnnotator.PARAM_VIEW_NAME,
+ goldViewName ) );
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ ViewTextCopierAnnotator.class,
+ ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
+ CAS.NAME_DEFAULT_SOFA,
+ ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
+ goldViewName ) );
+ switch ( this.xmlFormat ) {
+ case AnaforaCoref:
+ case Anafora:
+ if(this.subcorpus == Subcorpus.DeepPhe){
+ aggregateBuilder.add(
+ AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class,
+ THYMEAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY,
+ this.xmlDirectory,
+ THYMEAnaforaXMLReader.PARAM_ANAFORA_XML_SUFFIXES,
+ new String[]{} ),
+ CAS.NAME_DEFAULT_SOFA,
+ goldViewName );
+ }else{
+ aggregateBuilder.add(
+ THYMEAnaforaXMLReader.getDescription( this.xmlDirectory ),
+ CAS.NAME_DEFAULT_SOFA,
+ goldViewName );
+ }
+ break;
+ case Knowtator:
+ aggregateBuilder.add(
+ THYMEKnowtatorXMLReader.getDescription( this.xmlDirectory ),
+ CAS.NAME_DEFAULT_SOFA,
+ goldViewName );
+ break;
+ case I2B2:
+ aggregateBuilder.add(
+ I2B2TemporalXMLReader.getDescription( this.xmlDirectory ),
+ CAS.NAME_DEFAULT_SOFA,
+ goldViewName );
+ break;
+ }
+ return aggregateBuilder.createAggregateDescription();
+ }
+
+ protected AnalysisEngineDescription getLinguisticProcessingDescription() throws Exception{
+ AggregateBuilder aggregateBuilder = new AggregateBuilder();
+ // identify segments
+ if(this.subcorpus == Subcorpus.DeepPhe){
+ aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PittHeaderAnnotator.class));
+ }else{
+ aggregateBuilder
+ .add( AnalysisEngineFactory.createEngineDescription( SegmentsFromBracketedSectionTagsAnnotator.class ) );
+ }
+ // identify sentences
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ SentenceDetector.class,
+ SentenceDetector.SD_MODEL_FILE_PARAM,
+ "org/apache/ctakes/core/sentdetect/sd-med-model.zip" ) );
+ // aggregateBuilder.add(SentenceDetectorAnnotatorBIO.getDescription(FileLocator.locateFile("org/apache/ctakes/core/sentdetect/model.jar").getPath()));
+
+ // identify tokens
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TokenizerAnnotatorPTB.class ) );
+ // merge some tokens
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ContextDependentTokenizerAnnotator.class ) );
+
+ // identify part-of-speech tags
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ POSTagger.class,
+ TypeSystemDescriptionFactory.createTypeSystemDescription(),
+ TypePrioritiesFactory.createTypePriorities( Segment.class, Sentence.class, BaseToken.class ),
+ POSTagger.POS_MODEL_FILE_PARAM,
+ "org/apache/ctakes/postagger/models/mayo-pos.zip" ) );
+
+ // identify chunks
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ Chunker.class,
+ Chunker.CHUNKER_MODEL_FILE_PARAM,
+ FileLocator.locateFile( "org/apache/ctakes/chunker/models/chunker-model.zip" ),
+ Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+ DefaultChunkCreator.class ) );
+
+ // identify UMLS named entities
+
+ // adjust NP in NP NP to span both
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ ChunkAdjuster.class,
+ ChunkAdjuster.PARAM_CHUNK_PATTERN,
+ new String[] { "NP", "NP" },
+ ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+ 1 ) );
+ // adjust NP in NP PP NP to span all three
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ ChunkAdjuster.class,
+ ChunkAdjuster.PARAM_CHUNK_PATTERN,
+ new String[] { "NP", "PP", "NP" },
+ ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+ 2 ) );
+ // add lookup windows for each NP
+ aggregateBuilder
+ .add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) );
+ // maximize lookup windows
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription(
+ OverlapAnnotator.class,
+ "A_ObjectClass",
+ LookupWindowAnnotation.class,
+ "B_ObjectClass",
+ LookupWindowAnnotation.class,
+ "OverlapType",
+ "A_ENV_B",
+ "ActionType",
+ "DELETE",
+ "DeleteAction",
+ new String[] { "selector=B" } ) );
+ // add UMLS on top of lookup windows
+ aggregateBuilder.add( LvgAnnotator.createAnnotatorDescription() );
+ aggregateBuilder.add( DefaultJCasTermAnnotator.createAnnotatorDescription() );
+
+ // add dependency parser
+ aggregateBuilder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
+
+ // add semantic role labeler
+// aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ClearNLPSemanticRoleLabelerAE.class ) );
+
+ // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
+ if ( this.treebankDirectory != null ) {
+ aggregateBuilder.add( THYMETreebankReader.getDescription( this.treebankDirectory ) );
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( TimexAnnotationCorrector.class ) );
+ } else {
+ // add ctakes constituency parses to system view
+ aggregateBuilder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class,
+ ConstituencyParser.PARAM_MODEL_FILENAME,
+ "org/apache/ctakes/constituency/parser/models/thyme.bin" ) );
+ }
+
+ return aggregateBuilder.createAggregateDescription();
+ }
+
public static <T extends Annotation> List<T> selectExact( JCas jCas, Class<T> annotationClass, Segment segment ) {
List<T> annotations = Lists.newArrayList();
for ( T annotation : JCasUtil.selectCovered( jCas, annotationClass, segment ) ) {
@@ -945,15 +1004,14 @@ org.cleartk.eval.Evaluation_ImplBase<Int
public void process( JCas jCas ) throws AnalysisEngineProcessException {
URI uri = ViewUriUtil.getURI( jCas );
String content;
-
try {
- content = CharStreams.toString( new InputStreamReader( uri.toURL().openStream() ) );
- content = content.replace( (char)0xc, ' ' );
- jCas.setSofaDataString( content, "text/plain" );
+ content = CharStreams.toString( new InputStreamReader( uri.toURL().openStream() ) );
+ content = content.replace( (char)0xc, ' ' );
+ jCas.setSofaDataString( content, "text/plain" );
} catch ( MalformedURLException e ) {
- throw new AnalysisEngineProcessException( e );
+ throw new AnalysisEngineProcessException( e );
} catch ( IOException e ) {
- throw new AnalysisEngineProcessException( e );
+ throw new AnalysisEngineProcessException( e );
}
}
}