You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/07/11 03:08:22 UTC
svn commit: r1690329 - in /ctakes/sandbox/ctakes-allergy: ./ src/ src/main/
src/main/java/ src/main/java/org/ src/main/java/org/apache/
src/main/java/org/apache/ctakes/ src/main/java/org/apache/ctakes/allergy/
src/main/java/org/apache/ctakes/allergy/ae...
Author: seanfinan
Date: Sat Jul 11 01:08:21 2015
New Revision: 1690329
URL: http://svn.apache.org/r1690329
Log:
Checkin of simple regex allergy span detector
Added:
ctakes/sandbox/ctakes-allergy/
ctakes/sandbox/ctakes-allergy/pom.xml
ctakes/sandbox/ctakes-allergy/src/
ctakes/sandbox/ctakes-allergy/src/main/
ctakes/sandbox/ctakes-allergy/src/main/java/
ctakes/sandbox/ctakes-allergy/src/main/java/org/
ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/
ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/
ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/
ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/
ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java
ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/
ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java
ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/util/
Added: ctakes/sandbox/ctakes-allergy/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/pom.xml?rev=1690329&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-allergy/pom.xml (added)
+++ ctakes/sandbox/ctakes-allergy/pom.xml Sat Jul 11 01:08:21 2015
@@ -0,0 +1,113 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes</artifactId>
+ <version>3.2.3-SNAPSHOT</version>
+ </parent>
+ <artifactId>ctakes-allergy</artifactId>
+ <name>ctakes-allergy</name>
+ <description>Allergy Identification Prototype</description>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-clinical-pipeline</artifactId>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.uima</groupId>
+ <artifactId>jcasgen-maven-plugin</artifactId>
+ <version>2.5.0</version>
+ <executions>
+ <execution>
+ <goals><goal>generate</goal></goals>
+ <configuration>
+ <typeSystemIncludes>
+ <typeSystemInclude>src/main/resources/org/apache/ctakes/**/types/TypeSystem.xml</typeSystemInclude>
+ </typeSystemIncludes>
+ <limitToProject>false</limitToProject>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+ <profiles>
+ <profile>
+ <id>runAllergyCVD</id>
+ <activation>
+ <property>
+ <name>runAllergyCVD</name>
+ </property>
+ </activation>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <version>1.2.1</version>
+ <executions>
+ <execution>
+ <!-- Bound to the compile phase: depends on the other modules being on the classpath -->
+ <phase>compile</phase>
+ <goals>
+ <goal>java</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <includeProjectDependencies>true</includeProjectDependencies>
+ <includePluginDependencies>true</includePluginDependencies>
+ <mainClass>org.apache.uima.tools.cvd.CVD</mainClass>
+ <!-- At least one argument must be specified; otherwise CVD reports an invalid parameter,
+ because Maven passes null when joining to the mvn thread. -->
+ <arguments>
+ <argument>-lookandfeel</argument>
+ <argument>javax.swing.plaf.metal.MetalLookAndFeel</argument>
+ </arguments>
+ </configuration>
+ <dependencies>
+ <!-- System-scoped library dependencies are causing a world of pain. Remove these once
+ the jars are uploaded to Maven Central. NOTE(review): medfacts-i2b2 declares version 1.2 but its systemPath names a 1.2-SNAPSHOT jar; confirm. -->
+ <dependency>
+ <groupId>org.mitre.medfacts</groupId>
+ <artifactId>medfacts-i2b2</artifactId>
+ <version>1.2</version>
+ <scope>system</scope>
+ <systemPath>${project.basedir}/../ctakes-assertion/lib/med-facts-i2b2-1.2-SNAPSHOT.jar</systemPath>
+ </dependency>
+ <dependency>
+ <groupId>org.mitre.medfacts</groupId>
+ <artifactId>medfacts-zoner</artifactId>
+ <version>1.1</version>
+ <scope>system</scope>
+ <systemPath>${project.basedir}/../ctakes-assertion/lib/med-facts-zoner-1.1.jar</systemPath>
+ </dependency>
+ <dependency>
+ <groupId>org.mitre.jcarafe.core</groupId>
+ <artifactId>jcarafe.core</artifactId>
+ <version>2.9.1</version>
+ <scope>system</scope>
+ <systemPath>${project.basedir}/../ctakes-assertion/lib/jcarafe-core_2.9.1-0.9.8.3.RC4.jar</systemPath>
+ </dependency>
+ <dependency>
+ <groupId>org.mitre.jcarafe.ext</groupId>
+ <artifactId>jcarafe.ext</artifactId>
+ <version>2.9.1</version>
+ <scope>system</scope>
+ <systemPath>${project.basedir}/../ctakes-assertion/lib/jcarafe-ext_2.9.1-0.9.8.3.RC4.jar</systemPath>
+ </dependency>
+ <dependency>
+ <groupId>gov.nih.nlm.nls.lvg</groupId>
+ <artifactId>lvg2010dist</artifactId>
+ <version>0.0.1</version>
+ </dependency>
+ </dependencies>
+ </plugin>
+ </plugins>
+ </build>
+ </profile>
+ </profiles>
+</project>
\ No newline at end of file
Added: ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java?rev=1690329&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java (added)
+++ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java Sat Jul 11 01:08:21 2015
@@ -0,0 +1,157 @@
+package org.apache.ctakes.allergy.ae;
+
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.MedicationMention;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.util.Collection;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+/**
+ * Prototype annotator that flags allergy statements around medication mentions.
+ * For every {@link MedicationMention} in the cas, a window of document text before and after the
+ * medication is tested against a few regular expressions, and every match is stored as a
+ * {@link SignSymptomMention} carrying a generic hypersensitivity concept.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/10/2015
+ */
+final public class AllergyAnnotator extends JCasAnnotator_ImplBase {
+
+   static private final Logger LOGGER = Logger.getLogger( "AllergyAnnotator" );
+
+   /** Number of characters before a medication that are searched for a preceding allergy cue. */
+   static private final int PRE_WINDOW_CHARS = 40;
+   /** Number of characters after a medication that are searched for a following allergy cue. */
+   static private final int POST_WINDOW_CHARS = 20;
+
+
+   /**
+    * Cues that precede a medication, e.g. "Allergies: penicillin" or "allergic to penicillin".
+    * Compiled case-insensitive so they can run on the unmodified document text.
+    */
+   static private enum AllergyPreExpression {
+      COLON_LIST( "\\ballergies:\\s++[a-z,'\"\\t ]*" ),
+      ALLERGIC_TO( "\\ballergic( reaction)? to:?\\s++[a-z\\,'\"\\t ]*" );
+      final private Pattern __pattern;
+      private AllergyPreExpression( final String regex ) {
+         __pattern = Pattern.compile( regex, Pattern.CASE_INSENSITIVE );
+      }
+      private Matcher getMatcher( final CharSequence windowText ) {
+         return __pattern.matcher( windowText );
+      }
+   }
+
+   /**
+    * Cues that follow a medication, e.g. "penicillin allergy" or "latex hypersensitivity".
+    * Compiled case-insensitive so they can run on the unmodified document text.
+    */
+   static private enum AllergyPostExpression {
+      ALLERGY( "[a-z]* allergy"),
+      HYPERSENSITIVITY( "[a-z]* (hyper)?sensitivity");
+      final private Pattern __pattern;
+      private AllergyPostExpression( final String regex ) {
+         __pattern = Pattern.compile( regex, Pattern.CASE_INSENSITIVE );
+      }
+      private Matcher getMatcher( final CharSequence windowText ) {
+         return __pattern.matcher( windowText );
+      }
+   }
+
+   /**
+    * Scans the text around every {@link MedicationMention} for allergy cues and stores one annotation per match.
+    * {@inheritDoc}
+    */
+   @Override
+   public void process( final JCas jcas ) throws AnalysisEngineProcessException {
+      LOGGER.info( "Starting processing" );
+
+      final String docText = jcas.getDocumentText();
+      if ( docText == null ) {
+         // no document text on this cas, so there is nothing to scan
+         LOGGER.info( "Finished processing" );
+         return;
+      }
+      final Collection<MedicationMention> medications = JCasUtil.select( jcas, MedicationMention.class );
+
+      for ( MedicationMention medication : medications ) {
+         final int medicationBegin = medication.getBegin();
+         final int medicationEnd = medication.getEnd();
+         // The text is not lower-cased here: String.toLowerCase() uses the default locale and can change
+         // the length of the text, which would invalidate the offsets computed from the matchers.
+         final int windowBegin = Math.max( 0, medicationBegin - PRE_WINDOW_CHARS );
+         final String preWindowText = docText.substring( windowBegin, medicationEnd );
+         for ( AllergyPreExpression preExpression : AllergyPreExpression.values() ) {
+            final Matcher matcher = preExpression.getMatcher( preWindowText );
+            while ( matcher.find() ) {
+               // matcher offsets are relative to the pre-window, which starts at windowBegin
+               storeAllergy( jcas, windowBegin + matcher.start(), medicationEnd );
+               // could break from loop but there may be a wider context
+            }
+         }
+         final int windowEnd = Math.min( docText.length(), medicationEnd + POST_WINDOW_CHARS );
+         final String postWindowText = docText.substring( medicationBegin, windowEnd );
+         for ( AllergyPostExpression postExpression : AllergyPostExpression.values() ) {
+            final Matcher matcher = postExpression.getMatcher( postWindowText );
+            while ( matcher.find() ) {
+               // matcher offsets are relative to the post-window, which starts at the medication begin
+               // (not at windowBegin, which previously placed the span end up to 40 characters too early)
+               storeAllergy( jcas, medicationBegin, medicationBegin + matcher.end() );
+               // could break from loop but there may be a wider context
+            }
+         }
+      }
+      LOGGER.info( "Finished processing" );
+   }
+
+
+   /**
+    * Adds a {@link SignSymptomMention} covering the given span, tagged with a generic hypersensitivity concept.
+    *
+    * @param jcas       cas in which the concept and the annotation are created
+    * @param matchBegin begin offset of the allergy span in the document text
+    * @param matchEnd   end offset of the allergy span in the document text
+    */
+   static private void storeAllergy( final JCas jcas, final int matchBegin, final int matchEnd ) {
+      final UmlsConcept umlsConcept = new UmlsConcept( jcas );
+      umlsConcept.setCodingScheme( "AllergyPrototype" );
+      // C0020517 is a generic CUI for hypersensitivity / allergy
+      umlsConcept.setCui( "C0020517" );
+      umlsConcept.setTui( "T046" );
+      umlsConcept.setPreferredText( "Hypersensitivity" );
+      final FSArray conceptArr = new FSArray( jcas, 1 );
+      conceptArr.set( 0, umlsConcept );
+
+      final IdentifiedAnnotation annotation = new SignSymptomMention( jcas );
+      annotation.setTypeID( CONST.NE_TYPE_ID_FINDING );
+      annotation.setBegin( matchBegin );
+      annotation.setEnd( matchEnd );
+      annotation.setOntologyConceptArr( conceptArr );
+//      annotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
+      annotation.addToIndexes();
+   }
+
+
+   /**
+    * @return a uimaFIT description of this annotator, created without configuration parameters
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   static public AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription( AllergyAnnotator.class );
+   }
+
+}
Added: ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java?rev=1690329&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java (added)
+++ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java Sat Jul 11 01:08:21 2015
@@ -0,0 +1,231 @@
+package org.apache.ctakes.allergy.pipeline;
+
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+import org.apache.ctakes.allergy.ae.AllergyAnnotator;
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory;
+import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory.CopyNPChunksToLookupWindowAnnotations;
+import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory.RemoveEnclosedLookupWindows;
+import org.apache.ctakes.constituency.parser.ae.ConstituencyParser;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.cc.XmiWriterCasConsumerCtakes;
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
+import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
+import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.CollectionReaderFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.InvalidXMLException;
+
+import javax.annotation.concurrent.Immutable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+
+/**
+ * Builds and runs the allergy prototype pipeline: clinical pipeline components with the
+ * {@link AllergyAnnotator} added after dictionary lookup.  {@link #main(String...)} reads notes from
+ * an input directory and writes xmi files to an output directory.
+ */
+@Immutable
+final public class AllergyPipelineRunner {
+   private AllergyPipelineRunner() {
+   }
+
+   /**
+    * Command-line options; parsed in {@link #main(String...)} by jewel-cli.
+    */
+   static interface AllergyPipelineOptions {
+      @Option(
+            shortName = "i",
+            description = "specify the path to the directory containing the clinical notes to be processed" )
+      public String getInputDirectory();
+
+      @Option(
+            shortName = "o",
+            description = "specify the path to the directory where the output xmi files are to be saved" )
+      public String getOutputDirectory();
+   }
+
+   static private final String CTAKES_DIR_PREFIX = "/org/apache/ctakes/";
+
+   /**
+    * Equivalent to {@link #getPipelineDescription(String)} with an empty output directory.
+    *
+    * @return description of the allergy pipeline
+    * @throws ResourceInitializationException see {@link #getPipelineDescription(String)}
+    * @throws InvalidXMLException             see {@link #getPipelineDescription(String)}
+    * @throws IOException                     see {@link #getPipelineDescription(String)}
+    */
+   public static AnalysisEngineDescription getPipelineDescription()
+         throws ResourceInitializationException, InvalidXMLException, IOException {
+      return getPipelineDescription( "" );
+   }
+
+   /**
+    * Builds the pipeline description for the output directory named in the given options.
+    *
+    * @param options parsed command-line options; if null an empty output directory is used
+    * @return description of the allergy pipeline
+    * @throws ResourceInitializationException see {@link #getPipelineDescription(String)}
+    * @throws InvalidXMLException             see {@link #getPipelineDescription(String)}
+    * @throws IOException                     see {@link #getPipelineDescription(String)}
+    */
+   public static AnalysisEngineDescription getPipelineDescription( final AllergyPipelineOptions options )
+         throws ResourceInitializationException, InvalidXMLException, IOException {
+      // the options used to be ignored here; forward their output directory instead of always passing ""
+      final String outputDirectory = options == null ? "" : options.getOutputDirectory();
+      return getPipelineDescription( outputDirectory );
+   }
+
+   /**
+    * Assembles the aggregate: segmentation, sentence detection, tokenization, lvg, context dependent
+    * tokenization, part of speech tagging, chunking, dictionary lookup, the {@link AllergyAnnotator},
+    * dependency parsing, polarity, uncertainty, semantic role labeling and constituency parsing.
+    *
+    * @param outputDirectory not used while building the description; no writer is part of the aggregate
+    * @return description of the allergy pipeline
+    * @throws ResourceInitializationException if a component description cannot be created, for instance when
+    *                                         the dictionary descriptor cannot be located
+    * @throws InvalidXMLException             may be thrown by the component factories
+    * @throws IOException                     may be thrown by the component factories
+    */
+   public static AnalysisEngineDescription getPipelineDescription( final String outputDirectory )
+         throws ResourceInitializationException, InvalidXMLException, IOException {
+      final AggregateBuilder builder = new AggregateBuilder();
+      // core components, dictionary, dependency parser, polarity, uncertainty
+      builder.add( SimpleSegmentAnnotator.createAnnotatorDescription() );
+      builder.add( SentenceDetector.createAnnotatorDescription() );
+      builder.add( TokenizerAnnotatorPTB.createAnnotatorDescription() );
+      builder.add( LvgAnnotator.createAnnotatorDescription() );
+      builder.add( ContextDependentTokenizerAnnotator.createAnnotatorDescription() );
+      builder.add( POSTagger.createAnnotatorDescription() );
+      builder.add( Chunker.createAnnotatorDescription() );
+      builder.add( ClinicalPipelineFactory.getStandardChunkAdjusterAnnotator() );
+
+      builder
+            .add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) );
+      builder.add( AnalysisEngineFactory.createEngineDescription( RemoveEnclosedLookupWindows.class ) );
+      try {
+         builder.add( AnalysisEngineFactory.createEngineDescription( DefaultJCasTermAnnotator.class,
+               JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY,
+               ExternalResourceFactory.createExternalResourceDescription(
+                     FileResourceImpl.class,
+                     FileLocator.locateFile( "org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml" ) )
+         ) );
+      } catch ( FileNotFoundException e ) {
+         // the cause travels with the rethrown exception, so it is not also printed to stderr
+         throw new ResourceInitializationException( e );
+      }
+
+      // added after dictionary lookup; the allergy annotator reads existing MedicationMentions
+      builder.add( AllergyAnnotator.createAnnotatorDescription() );
+
+      builder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
+      builder.add( PolarityCleartkAnalysisEngine.createAnnotatorDescription() );
+      builder.add( UncertaintyCleartkAnalysisEngine.createAnnotatorDescription() );
+      builder.add( AnalysisEngineFactory.createEngineDescription( ClearNLPSemanticRoleLabelerAE.class ) );
+      builder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class ) );
+
+      return builder.createAggregateDescription();
+   }
+
+   /**
+    * Creates the collection reader from the FilesInDirectoryCollectionReader descriptor of ctakes-core.
+    *
+    * @param inputDirectory value for the reader's input directory parameter
+    * @return reader configured for the given input directory
+    */
+   private static CollectionReader createFilesInDirectoryReader( final String inputDirectory )
+         throws UIMAException, IOException {
+      final String descriptorPath
+            = FileLocator.getFullPath( "ctakes-core/desc/collection_reader/FilesInDirectoryCollectionReader.xml" );
+      return CollectionReaderFactory.createReaderFromPath( descriptorPath,
+            FilesInDirectoryCollectionReader.PARAM_INPUTDIR,
+            inputDirectory );
+   }
+
+   /**
+    * Creates the xmi writer engine.
+    *
+    * @param outputDirectory value for the writer's output directory parameter
+    * @return engine created from {@link XmiWriterCasConsumerCtakes}
+    */
+   private static AnalysisEngine createXMIWriter( final String outputDirectory )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngine( XmiWriterCasConsumerCtakes.class,
+            XmiWriterCasConsumerCtakes.PARAM_OUTPUTDIR,
+            outputDirectory );
+   }
+
+   /**
+    * Runs the allergy pipeline with a directory reader for the input and an xmi writer for the output.
+    *
+    * @param inputDirectory  directory containing the notes to process
+    * @param outputDirectory directory for the xmi output
+    * @throws UIMAException if the reader or an engine cannot be created, or the pipeline run fails
+    * @throws IOException   may be thrown while locating the reader descriptor or running the pipeline
+    */
+   public static void runAllergyPipeline( final String inputDirectory,
+                                          final String outputDirectory ) throws UIMAException, IOException {
+      final CollectionReader collectionReader = createFilesInDirectoryReader( inputDirectory );
+      final AnalysisEngineDescription analysisEngineDescription = getPipelineDescription( outputDirectory );
+      final AnalysisEngine xmiWriter = createXMIWriter( outputDirectory );
+      runAllergyPipeline( collectionReader, analysisEngineDescription, xmiWriter );
+   }
+
+   /**
+    * Instantiates the described engine and runs reader, engine and writer through {@link SimplePipeline}.
+    *
+    * @param collectionReader          source of the documents
+    * @param analysisEngineDescription description of the engine to instantiate and run on each document
+    * @param outputWriter              engine that runs after the analysis engine
+    * @throws UIMAException if the engine cannot be created or the pipeline run fails
+    * @throws IOException   may be thrown by the pipeline run
+    */
+   public static void runAllergyPipeline( final CollectionReader collectionReader,
+                                          final AnalysisEngineDescription analysisEngineDescription,
+                                          final AnalysisEngine outputWriter ) throws UIMAException, IOException {
+      SimplePipeline.runPipeline( collectionReader,
+            AnalysisEngineFactory.createEngine( analysisEngineDescription ),
+            outputWriter );
+   }
+
+   // NOTE(review): not referenced anywhere in this class
+   static private String getStandardModelPath( final String moduleDirectory ) {
+      return CTAKES_DIR_PREFIX + moduleDirectory + "/model.jar";
+   }
+
+
+   /**
+    * Parses the command line and runs the allergy pipeline.
+    *
+    * @param args command-line arguments, see {@link AllergyPipelineOptions}
+    * @throws UIMAException if the pipeline cannot be created or fails
+    * @throws IOException   may be thrown by the pipeline run
+    */
+   public static void main( final String... args ) throws UIMAException, IOException {
+      final AllergyPipelineOptions options = CliFactory.parseArguments( AllergyPipelineOptions.class, args );
+      runAllergyPipeline( options.getInputDirectory(), options.getOutputDirectory() );
+   }
+
+}