You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/03/16 17:59:14 UTC
svn commit: r1667069 -
/ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java
Author: seanfinan
Date: Mon Mar 16 16:59:14 2015
New Revision: 1667069
URL: http://svn.apache.org/r1667069
Log:
getFastPipeline doesn't need NP or LookupWindow information. Refactored accordingly.
Also cascaded the Note text in main(...) for easier reading.
Modified:
ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java
Modified: ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java?rev=1667069&r1=1667068&r2=1667069&view=diff
==============================================================================
--- ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java (original)
+++ ctakes/trunk/ctakes-clinical-pipeline/src/main/java/org/apache/ctakes/clinicalpipeline/ClinicalPipelineFactory.java Mon Mar 16 16:59:14 2015
@@ -18,18 +18,7 @@
*/
package org.apache.ctakes.clinicalpipeline;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.ctakes.assertion.medfacts.cleartk.ConditionalCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.GenericCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.HistoryCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.SubjectCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.*;
import org.apache.ctakes.chunker.ae.Chunker;
import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
import org.apache.ctakes.constituency.parser.ae.ConstituencyParser;
@@ -47,12 +36,14 @@ import org.apache.ctakes.dictionary.look
import org.apache.ctakes.lvg.ae.LvgAnnotator;
import org.apache.ctakes.postagger.POSTagger;
import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
import org.apache.ctakes.typesystem.type.syntax.Chunk;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
@@ -61,138 +52,211 @@ import org.apache.uima.fit.factory.JCasF
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.xml.sax.SAXException;
-public class ClinicalPipelineFactory {
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.*;
+
+final public class ClinicalPipelineFactory {
+
+ private ClinicalPipelineFactory() {
+ }
+
+ public static AnalysisEngineDescription getDefaultPipeline() throws ResourceInitializationException {
+ AggregateBuilder builder = new AggregateBuilder();
+ builder.add( getTokenProcessingPipeline() );
+ builder.add( getNpChunkerPipeline() );
+ builder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class ) );
+ builder.add( UmlsDictionaryLookupAnnotator.createAnnotatorDescription() );
+ builder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
+ builder.add( PolarityCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( UncertaintyCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( HistoryCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( ConditionalCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( GenericCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( SubjectCleartkAnalysisEngine.createAnnotatorDescription() );
+
+ return builder.createAggregateDescription();
+ }
+
+ public static AnalysisEngineDescription getFastPipeline() throws ResourceInitializationException {
+ AggregateBuilder builder = new AggregateBuilder();
+ builder.add( getTokenProcessingPipeline() );
+ try {
+ builder.add( AnalysisEngineFactory.createEngineDescription( DefaultJCasTermAnnotator.class,
+ AbstractJCasTermAnnotator.PARAM_WINDOW_ANNOT_PRP,
+ "org.apache.ctakes.typesystem.type.textspan.Sentence",
+ JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY,
+ ExternalResourceFactory.createExternalResourceDescription(
+ FileResourceImpl.class,
+ FileLocator.locateFile( "org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml" ) )
+ ) );
+ } catch ( FileNotFoundException e ) {
+ e.printStackTrace();
+ throw new ResourceInitializationException( e );
+ }
+ builder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
+ builder.add( PolarityCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( UncertaintyCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( HistoryCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( ConditionalCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( GenericCleartkAnalysisEngine.createAnnotatorDescription() );
+ builder.add( SubjectCleartkAnalysisEngine.createAnnotatorDescription() );
+ return builder.createAggregateDescription();
+ }
+
+ public static AnalysisEngineDescription getParsingPipeline() throws ResourceInitializationException {
+ AggregateBuilder builder = new AggregateBuilder();
+ builder.add( getTokenProcessingPipeline() );
+ builder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
+ builder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class ) );
+ return builder.createAggregateDescription();
+ }
+
+ public static AnalysisEngineDescription getTokenProcessingPipeline() throws ResourceInitializationException {
+ AggregateBuilder builder = new AggregateBuilder();
+ builder.add( SimpleSegmentAnnotator.createAnnotatorDescription() );
+ builder.add( SentenceDetector.createAnnotatorDescription() );
+ builder.add( TokenizerAnnotatorPTB.createAnnotatorDescription() );
+ builder.add( LvgAnnotator.createAnnotatorDescription() );
+ builder.add( ContextDependentTokenizerAnnotator.createAnnotatorDescription() );
+ builder.add( POSTagger.createAnnotatorDescription() );
+ return builder.createAggregateDescription();
+ }
+
+ public static AnalysisEngineDescription getNpChunkerPipeline() throws ResourceInitializationException {
+ AggregateBuilder builder = new AggregateBuilder();
+ builder.add( Chunker.createAnnotatorDescription() );
+ builder.add( getStandardChunkAdjusterAnnotator() );
+ builder.add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) );
+ builder.add( AnalysisEngineFactory.createEngineDescription( RemoveEnclosedLookupWindows.class ) );
+ return builder.createAggregateDescription();
+ }
+
+ public static AnalysisEngineDescription getStandardChunkAdjusterAnnotator() throws ResourceInitializationException {
+ AggregateBuilder builder = new AggregateBuilder();
+ // adjust NP in NP NP to span both
+ builder.add( ChunkAdjuster.createAnnotatorDescription( new String[] { "NP", "NP" }, 1 ) );
+ // adjust NP in NP PP NP to span all three
+ builder.add( ChunkAdjuster.createAnnotatorDescription( new String[] { "NP", "PP", "NP" }, 2 ) );
+ return builder.createAggregateDescription();
+ }
+
+ public static void main( final String... args ) throws IOException, UIMAException, SAXException {
+ // The note is easier to read when sentences are stacked - changed 3/16/2015 spf
+ // Two sentences had no space after the period
+ // Introduction of a space before "Discussed" actually changed the uncertainty of "surgery" from true to false
+ final String note = "History of diabetes and hypertension."
+ + " Mother had breast cancer."
+ + " Sister with multiple sclerosis."
+ + " The patient is suffering from extreme pain due to shark bite."
+ + " Recommend continuing use of aspirin, oxycodone, and coumadin."
+ + " Continue exercise for obesity and hypertension."
+ + " Patient denies smoking and chest pain." // Space between sentences introduced " Patient"
+ + " Patient has no cancer."
+ + " There is no sign of multiple sclerosis."
+ + " Mass is suspicious for breast cancer."
+ + " Possible breast cancer."
+ + " Cannot exclude stenosis."
+ + " Some degree of focal pancreatitis is also possible."
+ + " Discussed surgery and chemotherapy." // Space between sentences introduced " Discussed"
+ + " Will return if pain continues.";
+ final JCas jcas = JCasFactory.createJCas();
+ jcas.setDocumentText( note );
+ final AnalysisEngineDescription aed = getDefaultPipeline();
+// final AnalysisEngineDescription aed = getFastPipeline(); // Outputs from default and fast pipelines are identical
+ SimplePipeline.runPipeline( jcas, aed );
+
+ final boolean printCuis = Arrays.asList( args ).contains( "cuis" );
+ final Collection<String> codes = new ArrayList<>();
+ for ( IdentifiedAnnotation entity : JCasUtil.select( jcas, IdentifiedAnnotation.class ) ) {
+
+ System.out.println( "Entity: " + entity.getCoveredText()
+ + " === Polarity: " + entity.getPolarity()
+ + " === Uncertain? " + (entity.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT)
+ + " === Subject: " + entity.getSubject()
+ + " === Generic? " + (entity.getGeneric() == CONST.NE_GENERIC_TRUE)
+ + " === Conditional? " + (entity.getConditional() == CONST.NE_CONDITIONAL_TRUE)
+ + " === History? " + (entity.getHistoryOf() == CONST.NE_HISTORY_OF_PRESENT)
+ );
+
+ if ( printCuis ) {
+ codes.clear();
+ codes.addAll( getCUIs( entity ) );
+ for ( String cui : codes ) {
+ System.out.print( cui + " " );
+ }
+ System.out.println();
+ }
+
+ }
+ if ( args.length > 0 ) {
+ aed.toXML( new FileWriter( args[ 0 ] ) );
+ }
+ }
+
+
+ /**
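+ * @param identifiedAnnotation annotation whose ontology concept array is read
+ * @return cuis of the annotation's UmlsConcepts, each as "cui_tui" when a tui is present; empty if the array is null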
+ */
+ static private Collection<String> getCUIs( final IdentifiedAnnotation identifiedAnnotation ) {
+ final FSArray fsArray = identifiedAnnotation.getOntologyConceptArr();
+ if ( fsArray == null ) {
+ return Collections.emptySet();
+ }
+ final FeatureStructure[] featureStructures = fsArray.toArray();
+ final Collection<String> cuis = new ArrayList<>( featureStructures.length );
+ for ( FeatureStructure featureStructure : featureStructures ) {
+ if ( featureStructure instanceof UmlsConcept ) {
+ final UmlsConcept umlsConcept = (UmlsConcept)featureStructure;
+ final String cui = umlsConcept.getCui();
+ final String tui = umlsConcept.getTui();
+ if ( tui != null && !tui.isEmpty() ) {
+ cuis.add( cui + "_" + tui );
+ } else {
+ cuis.add( cui );
+ }
+ }
+ }
+ return cuis;
+ }
+
+ public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+ @Override
+ public void process( JCas jCas ) throws AnalysisEngineProcessException {
+ for ( Chunk chunk : JCasUtil.select( jCas, Chunk.class ) ) {
+ if ( chunk.getChunkType().equals( "NP" ) ) {
+ new LookupWindowAnnotation( jCas, chunk.getBegin(), chunk.getEnd() ).addToIndexes();
+ }
+ }
+ }
+ }
+
+ public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase {
+
+ @Override
+ public void process( JCas jCas ) throws AnalysisEngineProcessException {
+ List<LookupWindowAnnotation> lws = new ArrayList<>( JCasUtil.select( jCas, LookupWindowAnnotation.class ) );
+ // we'll navigate backwards so that as we delete things we shorten the list from the back
+ for ( int i = lws.size() - 2; i >= 0; i-- ) {
+ LookupWindowAnnotation lw1 = lws.get( i );
+ LookupWindowAnnotation lw2 = lws.get( i + 1 );
+ if ( lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd() ) {
+ /// lw1 envelops or encloses lw2
+ lws.remove( i + 1 );
+ lw2.removeFromIndexes();
+ }
+ }
+
+ }
+
+ }
+
- public static AnalysisEngineDescription getDefaultPipeline() throws ResourceInitializationException{
- AggregateBuilder builder = new AggregateBuilder();
- builder.add(getTokenProcessingPipeline());
- builder.add(AnalysisEngineFactory.createEngineDescription(CopyNPChunksToLookupWindowAnnotations.class));
- builder.add(AnalysisEngineFactory.createEngineDescription(RemoveEnclosedLookupWindows.class));
- builder.add(AnalysisEngineFactory.createEngineDescription(ConstituencyParser.class));
- builder.add(UmlsDictionaryLookupAnnotator.createAnnotatorDescription());
- builder.add(ClearNLPDependencyParserAE.createAnnotatorDescription());
- builder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
- builder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
- builder.add(HistoryCleartkAnalysisEngine.createAnnotatorDescription());
- builder.add(ConditionalCleartkAnalysisEngine.createAnnotatorDescription());
- builder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription());
- builder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription());
-
- return builder.createAggregateDescription();
- }
-
- public static AnalysisEngineDescription getFastPipeline() throws ResourceInitializationException{
- AggregateBuilder builder = new AggregateBuilder();
- builder.add(getTokenProcessingPipeline());
- builder.add(AnalysisEngineFactory.createEngineDescription(CopyNPChunksToLookupWindowAnnotations.class));
- builder.add(AnalysisEngineFactory.createEngineDescription(RemoveEnclosedLookupWindows.class));
- try {
- builder.add(AnalysisEngineFactory.createEngineDescription(DefaultJCasTermAnnotator.class,
- AbstractJCasTermAnnotator.PARAM_WINDOW_ANNOT_PRP,
- "org.apache.ctakes.typesystem.type.textspan.Sentence",
- JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY,
- ExternalResourceFactory.createExternalResourceDescription(
- FileResourceImpl.class,
- FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml"))
- ));
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- throw new ResourceInitializationException(e);
- }
- builder.add(ClearNLPDependencyParserAE.createAnnotatorDescription());
- builder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
- builder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
- return builder.createAggregateDescription();
- }
-
- public static AnalysisEngineDescription getParsingPipeline() throws ResourceInitializationException{
- AggregateBuilder builder = new AggregateBuilder();
- builder.add(getTokenProcessingPipeline());
- builder.add(ClearNLPDependencyParserAE.createAnnotatorDescription());
- builder.add(AnalysisEngineFactory.createEngineDescription(ConstituencyParser.class));
- return builder.createAggregateDescription();
- }
-
- public static AnalysisEngineDescription getTokenProcessingPipeline() throws ResourceInitializationException {
- AggregateBuilder builder = new AggregateBuilder();
- builder.add(SimpleSegmentAnnotator.createAnnotatorDescription());
- builder.add(SentenceDetector.createAnnotatorDescription());
- builder.add(TokenizerAnnotatorPTB.createAnnotatorDescription());
- builder.add(LvgAnnotator.createAnnotatorDescription());
- builder.add(ContextDependentTokenizerAnnotator.createAnnotatorDescription());
- builder.add(POSTagger.createAnnotatorDescription());
- builder.add(Chunker.createAnnotatorDescription());
- builder.add(getStandardChunkAdjusterAnnotator());
-
- return builder.createAggregateDescription();
- }
-
- public static AnalysisEngineDescription getStandardChunkAdjusterAnnotator() throws ResourceInitializationException{
- AggregateBuilder builder = new AggregateBuilder();
- // adjust NP in NP NP to span both
- builder.add(ChunkAdjuster.createAnnotatorDescription(new String[] { "NP", "NP" }, 1));
- // adjust NP in NP PP NP to span all three
- builder.add(ChunkAdjuster.createAnnotatorDescription(new String[] { "NP", "PP", "NP" }, 2));
- return builder.createAggregateDescription();
- }
-
- public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
- if (chunk.getChunkType().equals("NP")) {
- new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
- }
- }
- }
- }
-
- public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase {
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- List<LookupWindowAnnotation> lws = new ArrayList<>(JCasUtil.select(jCas, LookupWindowAnnotation.class));
- // we'll navigate backwards so that as we delete things we shorten the list from the back
- for(int i = lws.size()-2; i >= 0; i--){
- LookupWindowAnnotation lw1 = lws.get(i);
- LookupWindowAnnotation lw2 = lws.get(i+1);
- if(lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd()){
- /// lw1 envelops or encloses lw2
- lws.remove(i+1);
- lw2.removeFromIndexes();
- }
- }
-
- }
-
- }
-
- public static void main(String[] args) throws FileNotFoundException, IOException, UIMAException, SAXException{
- AnalysisEngineDescription aed = getDefaultPipeline();
- String note = "History of diabetes and hypertension. Mother had breast cancer. Sister with multiple sclerosis. " +
- "The patient is suffering from extreme pain due to shark bite. Recommend continuing use of aspirin, oxycodone, and coumadin. Continue exercise for obesity and hypertension." +
- "Patient denies smoking and chest pain. Patient has no cancer. There is no sign of multiple sclerosis. " +
- "Mass is suspicious for breast cancer. Possible breast cancer. Cannot exclude stenosis. Some degree of focal pancreatitis is also possible." +
- "Discussed surgery and chemotherapy. Will return if pain continues. ";
- JCas jcas = JCasFactory.createJCas();
- jcas.setDocumentText(note);
- SimplePipeline.runPipeline(jcas, aed);
-
- for(IdentifiedAnnotation entity : JCasUtil.select(jcas, IdentifiedAnnotation.class)){
- System.out.println("Entity: " + entity.getCoveredText() + " === Polarity: " + entity.getPolarity() +
- " === Uncertain? " + (entity.getUncertainty()==CONST.NE_UNCERTAINTY_PRESENT) +
- " === Subject: " + entity.getSubject() +
- " === Generic? " + (entity.getGeneric() == CONST.NE_GENERIC_TRUE) +
- " === Conditional? " + (entity.getConditional() == CONST.NE_CONDITIONAL_TRUE) +
- " === History? " + (entity.getHistoryOf() == CONST.NE_HISTORY_OF_PRESENT)
- );
- }
-
- if(args.length > 0)
- aed.toXML(new FileWriter(args[0]));
- }
-}
+}
\ No newline at end of file