You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/07/11 03:08:22 UTC

svn commit: r1690329 - in /ctakes/sandbox/ctakes-allergy: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/ctakes/ src/main/java/org/apache/ctakes/allergy/ src/main/java/org/apache/ctakes/allergy/ae...

Author: seanfinan
Date: Sat Jul 11 01:08:21 2015
New Revision: 1690329

URL: http://svn.apache.org/r1690329
Log:
Checkin of simple regex allergy span detector

Added:
    ctakes/sandbox/ctakes-allergy/
    ctakes/sandbox/ctakes-allergy/pom.xml
    ctakes/sandbox/ctakes-allergy/src/
    ctakes/sandbox/ctakes-allergy/src/main/
    ctakes/sandbox/ctakes-allergy/src/main/java/
    ctakes/sandbox/ctakes-allergy/src/main/java/org/
    ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/
    ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/
    ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/
    ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/
    ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java
    ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/
    ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java
    ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/util/

Added: ctakes/sandbox/ctakes-allergy/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/pom.xml?rev=1690329&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-allergy/pom.xml (added)
+++ ctakes/sandbox/ctakes-allergy/pom.xml Sat Jul 11 01:08:21 2015
@@ -0,0 +1,113 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.ctakes</groupId>
+    <artifactId>ctakes</artifactId>
+    <version>3.2.3-SNAPSHOT</version>
+  </parent>
+  <artifactId>ctakes-allergy</artifactId>
+  <name>ctakes-allergy</name>
+  <description>Allergy Identification Prototype</description>
+  	<dependencies>
+		<dependency>
+			<groupId>org.apache.ctakes</groupId>
+			<artifactId>ctakes-clinical-pipeline</artifactId>
+		</dependency>
+	</dependencies>
+   <build>
+      <plugins>
+         <plugin>
+            <groupId>org.apache.uima</groupId>
+            <artifactId>jcasgen-maven-plugin</artifactId>
+            <version>2.5.0</version>
+            <executions>
+               <execution>
+                  <goals><goal>generate</goal></goals>
+                  <configuration>
+                     <typeSystemIncludes>
+                        <typeSystemInclude>src/main/resources/org/apache/ctakes/**/types/TypeSystem.xml</typeSystemInclude>
+                     </typeSystemIncludes>
+                     <limitToProject>false</limitToProject>
+                  </configuration>
+               </execution>
+            </executions>
+         </plugin>
+      </plugins>
+   </build>
+   <profiles>
+      <profile>
+         <id>runAllergyCVD</id>
+         <activation>
+            <property>
+               <name>runAllergyCVD</name>
+            </property>
+         </activation>
+         <build>
+            <plugins>
+               <plugin>
+                  <groupId>org.codehaus.mojo</groupId>
+                  <artifactId>exec-maven-plugin</artifactId>
+                  <version>1.2.1</version>
+                  <executions>
+                     <execution>
+                        <!-- depends on other modules being on classpath -->
+                        <phase>compile</phase>
+                        <goals>
+                           <goal>java</goal>
+                        </goals>
+                     </execution>
+                  </executions>
+                  <configuration>
+                     <includeProjectDependencies>true</includeProjectDependencies>
+                     <includePluginDependencies>true</includePluginDependencies>
+                     <mainClass>org.apache.uima.tools.cvd.CVD</mainClass>
+                     <!-- Have to specify at least one parameter; otherwise CVD thinks
+                        it's an invalid param because MVN passes null when joining to the mvn thread -->
+                     <arguments>
+                        <argument>-lookandfeel</argument>
+                        <argument>javax.swing.plaf.metal.MetalLookAndFeel</argument>
+                     </arguments>
+                  </configuration>
+                  <dependencies>
+                     <!-- System lib dependencies are causing a world of pain. Remove these
+                        when they're uploaded to Maven Central -->
+                     <dependency>
+                        <groupId>org.mitre.medfacts</groupId>
+                        <artifactId>medfacts-i2b2</artifactId>
+                        <version>1.2</version>
+                        <scope>system</scope>
+                        <systemPath>${project.basedir}/../ctakes-assertion/lib/med-facts-i2b2-1.2-SNAPSHOT.jar</systemPath>
+                     </dependency>
+                     <dependency>
+                        <groupId>org.mitre.medfacts</groupId>
+                        <artifactId>medfacts-zoner</artifactId>
+                        <version>1.1</version>
+                        <scope>system</scope>
+                        <systemPath>${project.basedir}/../ctakes-assertion/lib/med-facts-zoner-1.1.jar</systemPath>
+                     </dependency>
+                     <dependency>
+                        <groupId>org.mitre.jcarafe.core</groupId>
+                        <artifactId>jcarafe.core</artifactId>
+                        <version>2.9.1</version>
+                        <scope>system</scope>
+                        <systemPath>${project.basedir}/../ctakes-assertion/lib/jcarafe-core_2.9.1-0.9.8.3.RC4.jar</systemPath>
+                     </dependency>
+                     <dependency>
+                        <groupId>org.mitre.jcarafe.ext</groupId>
+                        <artifactId>jcarafe.ext</artifactId>
+                        <version>2.9.1</version>
+                        <scope>system</scope>
+                        <systemPath>${project.basedir}/../ctakes-assertion/lib/jcarafe-ext_2.9.1-0.9.8.3.RC4.jar</systemPath>
+                     </dependency>
+                     <dependency>
+                        <groupId>gov.nih.nlm.nls.lvg</groupId>
+                        <artifactId>lvg2010dist</artifactId>
+                        <version>0.0.1</version>
+                     </dependency>
+                  </dependencies>
+               </plugin>
+            </plugins>
+         </build>
+      </profile>
+   </profiles>
+</project>
\ No newline at end of file

Added: ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java?rev=1690329&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java (added)
+++ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java Sat Jul 11 01:08:21 2015
@@ -0,0 +1,116 @@
+package org.apache.ctakes.allergy.ae;
+
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.MedicationMention;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.util.Collection;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/10/2015
+ */
+final public class AllergyAnnotator extends JCasAnnotator_ImplBase {
+
+   static private final Logger LOGGER = Logger.getLogger( "AllergyAnnotator" );
+
+
+   static private enum AllergyPreExpression {
+      COLON_LIST( "\\ballergies:\\s++[a-z,'\"\\t ]*" ),
+      ALLERGIC_TO( "\\ballergic( reaction)? to:?\\s++[a-z\\,'\"\\t ]*" );
+      final private Pattern __pattern;
+      private AllergyPreExpression( final String regex ) {
+         __pattern = Pattern.compile( regex );
+      }
+      private Matcher getMatcher( final CharSequence windowText ) {
+         return __pattern.matcher( windowText );
+      }
+   }
+
+   static private enum AllergyPostExpression {
+      ALLERGY( "[a-z]* allergy"),
+      HYPERSENSITIVITY( "[a-z]* (hyper)?sensitivity");
+      final private Pattern __pattern;
+      private AllergyPostExpression( final String regex ) {
+         __pattern = Pattern.compile( regex );
+      }
+      private Matcher getMatcher( final CharSequence windowText ) {
+         return __pattern.matcher( windowText );
+      }
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void process( final JCas jcas ) throws AnalysisEngineProcessException {
+      LOGGER.info( "Starting processing" );
+
+      final String docText = jcas.getDocumentText();
+      final Collection<MedicationMention> medications = JCasUtil.select( jcas, MedicationMention.class );
+
+      for ( MedicationMention medication : medications ) {
+         final int windowBegin = Math.max( 0, medication.getBegin() - 40 );
+         final String preWindowText = docText.substring( windowBegin, medication.getEnd() ).toLowerCase();
+         for ( AllergyPreExpression preExpression : AllergyPreExpression.values() ) {
+            final Matcher matcher = preExpression.getMatcher( preWindowText );
+            while ( matcher.find() ) {
+               storeAllergy( jcas, windowBegin + matcher.start(), medication.getEnd() );
+               // could break from loop but there may be a wider context
+            }
+         }
+         final int windowEnd = Math.min( docText.length(), medication.getEnd() + 20 );
+         final String postWindowText = docText.substring( medication.getBegin(), windowEnd ).toLowerCase();
+         for ( AllergyPostExpression postExpression : AllergyPostExpression.values() ) {
+            final Matcher matcher = postExpression.getMatcher( postWindowText );
+            while ( matcher.find() ) {
+               storeAllergy( jcas, medication.getBegin(), windowBegin + matcher.end() );
+               // could break from loop but there may be a wider context
+            }
+         }
+      }
+      LOGGER.info( "Finished processing" );
+   }
+
+
+   static private void storeAllergy( final JCas jcas, final int matchBegin, final int matchEnd ) {
+      final UmlsConcept umlsConcept = new UmlsConcept( jcas );
+      umlsConcept.setCodingScheme( "AllergyPrototype" );
+      // C0020517 is a generic CUI for hypersensitivity / allergy
+      umlsConcept.setCui( "C0020517" );
+      umlsConcept.setTui( "T046" );
+      umlsConcept.setPreferredText( "Hypersensitivity" );
+      final FSArray conceptArr = new FSArray( jcas, 1 );
+      conceptArr.set( 0, umlsConcept );
+
+      final IdentifiedAnnotation annotation = new SignSymptomMention( jcas );
+      annotation.setTypeID( CONST.NE_TYPE_ID_FINDING );
+      annotation.setBegin( matchBegin );
+      annotation.setEnd( matchEnd );
+      annotation.setOntologyConceptArr( conceptArr );
+//            annotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
+      annotation.addToIndexes();
+   }
+
+
+
+   static public AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription( AllergyAnnotator.class );
+   }
+
+}

Added: ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java?rev=1690329&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java (added)
+++ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java Sat Jul 11 01:08:21 2015
@@ -0,0 +1,155 @@
+package org.apache.ctakes.allergy.pipeline;
+
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+import org.apache.ctakes.allergy.ae.AllergyAnnotator;
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory;
+import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory.CopyNPChunksToLookupWindowAnnotations;
+import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory.RemoveEnclosedLookupWindows;
+import org.apache.ctakes.constituency.parser.ae.ConstituencyParser;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.cc.XmiWriterCasConsumerCtakes;
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
+import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
+import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.CollectionReaderFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.InvalidXMLException;
+
+import javax.annotation.concurrent.Immutable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+
+/**
+ * Command-line entry point and factory methods for a clinical pipeline in which the
+ * {@link AllergyAnnotator} runs after dictionary lookup and before the dependency parser and the
+ * polarity / uncertainty engines.  The class only has static members and cannot be instantiated.
+ */
+@Immutable
+final public class AllergyPipelineRunner {
+   private AllergyPipelineRunner() {
+   }
+
+   /** Command-line options parsed by JewelCli in {@link #main(String...)}. */
+   static interface AllergyPipelineOptions {
+      @Option(
+            shortName = "i",
+            description = "specify the path to the directory containing the clinical notes to be processed" )
+      public String getInputDirectory();
+
+      @Option(
+            shortName = "o",
+            description = "specify the path to the directory where the output xmi files are to be saved" )
+      public String getOutputDirectory();
+   }
+
+   static private final String CTAKES_DIR_PREFIX = "/org/apache/ctakes/";
+
+   /**
+    * Builds the pipeline description with an empty output directory.
+    *
+    * @return aggregate description, see {@link #getPipelineDescription(String)}
+    * @throws ResourceInitializationException propagated from {@link #getPipelineDescription(String)}
+    * @throws InvalidXMLException             propagated from {@link #getPipelineDescription(String)}
+    * @throws IOException                     propagated from {@link #getPipelineDescription(String)}
+    */
+   public static AnalysisEngineDescription getPipelineDescription()
+         throws ResourceInitializationException, InvalidXMLException, IOException {
+      return getPipelineDescription( "" );
+   }
+
+   /**
+    * Builds the pipeline description for the given command-line options.
+    *
+    * @param options parsed options; when null an empty output directory is used
+    * @return aggregate description, see {@link #getPipelineDescription(String)}
+    * @throws ResourceInitializationException propagated from {@link #getPipelineDescription(String)}
+    * @throws InvalidXMLException             propagated from {@link #getPipelineDescription(String)}
+    * @throws IOException                     propagated from {@link #getPipelineDescription(String)}
+    */
+   public static AnalysisEngineDescription getPipelineDescription( final AllergyPipelineOptions options )
+         throws ResourceInitializationException, InvalidXMLException, IOException {
+      // Forward the requested output directory instead of discarding the options;
+      // a null argument keeps the previous "" default.
+      final String outputDirectory = options == null ? "" : options.getOutputDirectory();
+      return getPipelineDescription( outputDirectory );
+   }
+
+   /**
+    * Assembles the aggregate engine description: segmenting, sentence detection, tokenizing, LVG,
+    * context-dependent tokenizing, POS tagging, chunking, lookup-window preparation, dictionary lookup,
+    * the {@link AllergyAnnotator}, and finally dependency parsing, polarity, uncertainty, semantic
+    * role labeling and constituency parsing.
+    *
+    * @param outputDirectory output directory; not referenced when building the description
+    * @return description of the aggregate built from the engines listed above
+    * @throws ResourceInitializationException if the dictionary descriptor file cannot be found, or
+    *                                         propagated from the component factories
+    * @throws InvalidXMLException             declared for the component factories
+    * @throws IOException                     declared for the component factories
+    */
+   public static AnalysisEngineDescription getPipelineDescription( final String outputDirectory )
+         throws ResourceInitializationException, InvalidXMLException, IOException {
+      final AggregateBuilder builder = new AggregateBuilder();
+      // core components, dictionary, dependency parser, polarity, uncertainty
+      builder.add( SimpleSegmentAnnotator.createAnnotatorDescription() );
+      builder.add( SentenceDetector.createAnnotatorDescription() );
+      builder.add( TokenizerAnnotatorPTB.createAnnotatorDescription() );
+      builder.add( LvgAnnotator.createAnnotatorDescription() );
+      builder.add( ContextDependentTokenizerAnnotator.createAnnotatorDescription() );
+      builder.add( POSTagger.createAnnotatorDescription() );
+      builder.add( Chunker.createAnnotatorDescription() );
+      builder.add( ClinicalPipelineFactory.getStandardChunkAdjusterAnnotator() );
+
+      builder.add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) );
+      builder.add( AnalysisEngineFactory.createEngineDescription( RemoveEnclosedLookupWindows.class ) );
+      try {
+         builder.add( AnalysisEngineFactory.createEngineDescription( DefaultJCasTermAnnotator.class,
+               JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY,
+               ExternalResourceFactory.createExternalResourceDescription(
+                     FileResourceImpl.class,
+                     FileLocator.locateFile( "org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml" ) )
+         ) );
+      } catch ( FileNotFoundException e ) {
+         // The cause is carried by the rethrown exception, so no separate stack trace dump is needed.
+         throw new ResourceInitializationException( e );
+      }
+
+      // The allergy detector reads MedicationMention annotations, so it is added after dictionary lookup.
+      builder.add( AllergyAnnotator.createAnnotatorDescription() );
+
+      builder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
+      builder.add( PolarityCleartkAnalysisEngine.createAnnotatorDescription() );
+      builder.add( UncertaintyCleartkAnalysisEngine.createAnnotatorDescription() );
+      builder.add( AnalysisEngineFactory.createEngineDescription( ClearNLPSemanticRoleLabelerAE.class ) );
+      builder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class ) );
+
+      return builder.createAggregateDescription();
+   }
+
+   /**
+    * Creates a collection reader from the FilesInDirectoryCollectionReader descriptor, configured
+    * with the given input directory.
+    *
+    * @param inputDirectory value passed as {@code FilesInDirectoryCollectionReader.PARAM_INPUTDIR}
+    * @return the configured collection reader
+    * @throws UIMAException propagated from the reader factory
+    * @throws IOException   propagated from locating or reading the descriptor
+    */
+   private static CollectionReader createFilesInDirectoryReader( final String inputDirectory ) throws UIMAException,
+                                                                                                      IOException {
+      final String descriptorPath
+            = FileLocator.getFullPath( "ctakes-core/desc/collection_reader/FilesInDirectoryCollectionReader.xml" );
+      return CollectionReaderFactory.createReaderFromPath( descriptorPath,
+            FilesInDirectoryCollectionReader.PARAM_INPUTDIR,
+            inputDirectory );
+
+   }
+
+   /**
+    * Creates the XMI writer engine configured with the given output directory.
+    *
+    * @param outputDirectory value passed as {@code XmiWriterCasConsumerCtakes.PARAM_OUTPUTDIR}
+    * @return the configured writer engine
+    * @throws ResourceInitializationException propagated from the engine factory
+    */
+   private static AnalysisEngine createXMIWriter( final String outputDirectory )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngine( XmiWriterCasConsumerCtakes.class,
+            XmiWriterCasConsumerCtakes.PARAM_OUTPUTDIR,
+            outputDirectory );
+   }
+
+   /**
+    * Runs the allergy pipeline with a directory reader for the input and an XMI writer for the output.
+    *
+    * @param inputDirectory  directory handed to the collection reader
+    * @param outputDirectory directory handed to the XMI writer
+    * @throws UIMAException propagated from creating or running the pipeline
+    * @throws IOException   propagated from creating or running the pipeline
+    */
+   public static void runAllergyPipeline( final String inputDirectory,
+                                         final String outputDirectory ) throws UIMAException, IOException {
+      final CollectionReader collectionReader = createFilesInDirectoryReader( inputDirectory );
+      final AnalysisEngineDescription analysisEngineDescription = getPipelineDescription( outputDirectory );
+      final AnalysisEngine xmiWriter = createXMIWriter( outputDirectory );
+      runAllergyPipeline( collectionReader, analysisEngineDescription, xmiWriter );
+   }
+
+   /**
+    * Instantiates the described engine and runs it, followed by the output writer, over the reader.
+    *
+    * @param collectionReader          source of the documents
+    * @param analysisEngineDescription description of the engine to instantiate and run
+    * @param outputWriter              engine run after the analysis engine
+    * @throws UIMAException propagated from engine creation or {@link SimplePipeline}
+    * @throws IOException   propagated from {@link SimplePipeline}
+    */
+   public static void runAllergyPipeline( final CollectionReader collectionReader,
+                                         final AnalysisEngineDescription analysisEngineDescription,
+                                         final AnalysisEngine outputWriter ) throws UIMAException, IOException {
+      SimplePipeline.runPipeline( collectionReader,
+            AnalysisEngineFactory.createEngine( analysisEngineDescription ),
+            outputWriter );
+   }
+
+   /**
+    * Builds the path {@code /org/apache/ctakes/<moduleDirectory>/model.jar}.
+    * Not called anywhere in this class at present.
+    *
+    * @param moduleDirectory module directory name inserted into the path
+    * @return the concatenated path
+    */
+   static private String getStandardModelPath( final String moduleDirectory ) {
+      return CTAKES_DIR_PREFIX + moduleDirectory + "/model.jar";
+   }
+
+
+   /**
+    * Parses the command-line options (short names {@code i} and {@code o}) and runs the pipeline.
+    *
+    * @param args command-line arguments handed to JewelCli
+    * @throws UIMAException propagated from {@link #runAllergyPipeline(String, String)}
+    * @throws IOException   propagated from {@link #runAllergyPipeline(String, String)}
+    */
+   public static void main( final String... args ) throws UIMAException, IOException {
+      final AllergyPipelineOptions options = CliFactory.parseArguments( AllergyPipelineOptions.class, args );
+      runAllergyPipeline( options.getInputDirectory(), options.getOutputDirectory() );
+   }
+
+}