You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/02/06 23:12:38 UTC

svn commit: r1565459 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines: GoldEntityAndAttributeReaderPipelineForSeedCorpus.java SharpCorpusSplit.java

Author: tmill
Date: Thu Feb  6 22:12:38 2014
New Revision: 1565459

URL: http://svn.apache.org/r1565459
Log:
CTAKES-94: Reads sharp seed/stratified, mipacq, i2b2 data, and has the train/test splits for sharp data.

Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java?rev=1565459&r1=1565458&r2=1565459&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java Thu Feb  6 22:12:38 2014
@@ -24,6 +24,12 @@ import java.io.FileFilter;
 import java.io.IOException;
 import java.util.HashMap;
 
+import org.apache.ctakes.assertion.cr.I2B2Challenge2010CollectionReader;
+import org.apache.ctakes.assertion.cr.MiPACQKnowtatorXMLReader;
+import org.apache.ctakes.assertion.cr.NegExCorpusReader;
+import org.apache.ctakes.assertion.pipelines.SharpCorpusSplit.Subcorpus;
+import org.apache.ctakes.core.ae.SHARPKnowtatorXMLReader;
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
 import org.apache.log4j.Logger;
 import org.apache.uima.UIMAException;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
@@ -36,12 +42,7 @@ import org.uimafit.factory.AnalysisEngin
 import org.uimafit.factory.CollectionReaderFactory;
 import org.uimafit.factory.TypeSystemDescriptionFactory;
 import org.uimafit.pipeline.SimplePipeline;
-import org.apache.ctakes.assertion.cr.GoldEntityAndAttributeReader;
-import org.apache.ctakes.assertion.cr.I2B2Challenge2010CollectionReader;
-import org.apache.ctakes.assertion.cr.MiPACQKnowtatorXMLReader;
-import org.apache.ctakes.assertion.cr.NegExCorpusReader;
-import org.apache.ctakes.core.ae.SHARPKnowtatorXMLReader;
-import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import com.google.common.base.Function;
 
 /**
  * 
@@ -70,15 +71,15 @@ public class GoldEntityAndAttributeReade
 		//String parentDirectoryString = "/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/Seattle Group Health/UMLS_CEM";
 
 		File parentDirectory = new File(parentDirectoryString);
-		readSharpUmlsCem(parentDirectory);
+		readSharpSeedUmlsCem(parentDirectory);
 		
 	}
 
-	public static void readSharpUmlsCem(File parentDirectory) throws ResourceInitializationException, UIMAException, IOException {
-		readSharpUmlsCem(parentDirectory, null, null, null);
+	public static void readSharpSeedUmlsCem(File parentDirectory) throws ResourceInitializationException, UIMAException, IOException {
+		readSharpSeedUmlsCem(parentDirectory, null, null, null);
 	}
 	
-	public static void readSharpUmlsCem(File parentDirectory, File trainDirectory, File testDirectory, File devDirectory)
+	public static void readSharpSeedUmlsCem(File parentDirectory, File trainDirectory, File testDirectory, File devDirectory)
 			throws ResourceInitializationException, UIMAException, IOException {
 //		logger.info("parent directory: " + parentDirectoryString);
 //		File parentDirectory = new File(parentDirectoryString);
@@ -176,7 +177,7 @@ public class GoldEntityAndAttributeReade
 			
 			if (trainDirectory!=null && testDirectory!=null && devDirectory!=null) {
 				File subcorpusDirectory;
-				switch (SharpCorpusSplit.split(currentBatchDirectory)) {
+				switch (SharpCorpusSplit.splitSeed(currentBatchDirectory)) {
 				case TRAIN: 
 					subcorpusDirectory = trainDirectory;
 					break;
@@ -211,6 +212,103 @@ public class GoldEntityAndAttributeReade
 		logger.info("Finished!");
 	}
 	
+	public static void readSharpStratifiedUmls(File releaseDirectory, File trainDirectory, File testDirectory, File devDirectory) throws UIMAException, IOException{
+	  File mayoStrat = new File(releaseDirectory, "SHARP/MayoStrat/by-batch/umls");
+	  // sghStrat not annotated yet...
+	  File sghStrat = new File(releaseDirectory, "SHARP/SGHStrat1/by-batch/umls");
+	  
+	  readSharpUmls(new File[] {mayoStrat, sghStrat}, trainDirectory, testDirectory, devDirectory, 
+	      new Function<File,Subcorpus>(){
+	    public Subcorpus apply(File f){
+	      return SharpCorpusSplit.splitStratified(f);
+	  }
+	  });
+	}
+	
+	public static void readSharpSeedUmls(File releaseDirectory, File trainDirectory, File testDirectory, File devDirectory) throws UIMAException, IOException{
+	  File seed1 = new File(releaseDirectory, "SHARP/SeedSet1/by-batch/umls");
+	  readSharpUmls(new File[] {seed1}, trainDirectory, testDirectory, devDirectory,
+	       new Function<File,Subcorpus>(){
+	      public Subcorpus apply(File f){
+	        return SharpCorpusSplit.splitSeed(f);
+	    }
+	    });
+
+	}
+	
+	public static void readSharpUmls(File[] sections, File trainDirectory, File testDirectory, File devDirectory, Function<File,Subcorpus> splitFunction) throws UIMAException, IOException{
+	  // For every batch directory under each section: read the files in <batch>/text, add the gold
+	  // Knowtator annotations, run the attribute-discovery preprocessor, and write the result with
+	  // XWriter into the train/dev/test directory that splitFunction selects for that batch.
+	  for(File section : sections){
+	    File[] batches = section.listFiles(new FileFilter(){
+        @Override
+        public boolean accept(File pathname) {
+          return pathname.isDirectory();
+        }});
+	    if(batches == null){ logger.warn("Skipping section that is missing or unreadable: " + section); continue; } // listFiles returned null
+	    for(File batchDir : batches){
+	      TypeSystemDescription typeSystemDescription = 
+	          // use the uimafit method of finding available type system
+	          // descriptor via META-INF/org.uimafit/types.txt 
+	          // (found in ctakes-type-system/src/main/resources)
+	        TypeSystemDescriptionFactory.createTypeSystemDescription();
+	      File textDirectory = new File(batchDir, "text");
+	      AggregateBuilder aggregate = new AggregateBuilder();
+	      
+	      CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
+	          FilesInDirectoryCollectionReader.class,
+	          typeSystemDescription,
+	          "InputDirectory",
+	          textDirectory.toString()
+	          );
+	      
+	      // read the UMLS_CEM data from Knowtator
+	      AnalysisEngineDescription goldAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
+	          SHARPKnowtatorXMLReader.class,
+	          typeSystemDescription,
+	          "TextDirectory", // 3/13/13 halgrim changed from "TextURI" trying to work with new SHARPKnowtatorXMLReader.java
+	          textDirectory.toString() + "/"
+	      );
+	      aggregate.add(goldAnnotator);
+
+	      // fill in other values that are necessary for preprocessing
+	      AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
+	          "desc/analysis_engine/AttributeDiscoveryPreprocessor"
+	          );
+	      aggregate.add(preprocessAnnotator);
+
+	      File subcorpusDir = null;
+	      Subcorpus subcorpus = splitFunction.apply(batchDir);
+	      switch(subcorpus){
+	      case TRAIN:
+	        subcorpusDir = trainDirectory;
+	        break;
+	      case DEV:
+	        subcorpusDir = devDirectory;
+	        break;
+	      case TEST:
+	        subcorpusDir = testDirectory;
+	        break;
+	      default: // any other value (e.g. CROSSVAL) is written with the training data
+	        subcorpusDir = trainDirectory;
+	      }
+	      
+	       AnalysisEngineDescription xWriter = AnalysisEngineFactory.createPrimitiveDescription(
+	            XWriter.class,
+	            typeSystemDescription,
+	            XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
+	            subcorpusDir,
+	            XWriter.PARAM_FILE_NAMER_CLASS_NAME,
+	            CtakesFileNamer.class.getName()
+	        );
+	       aggregate.add(xWriter);
+	       SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
+
+	    }
+	  }
+	}
+
 	public static void readI2B2Challenge2010(File parentDirectory, File preprocessedDirectory)
 	throws ResourceInitializationException, UIMAException, IOException {
 

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java?rev=1565459&r1=1565458&r2=1565459&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java Thu Feb  6 22:12:38 2014
@@ -4,6 +4,11 @@ import java.io.File;
 import java.util.HashMap;
 import java.util.Map;
 
+/*
+ * See here for training/dev/test split information:
+ * http://informatics.mayo.edu/sharp/index.php/Annotation#Training.2FDevelopment.2FTest_Split
+ */
+
 public class SharpCorpusSplit {
 	public enum Subcorpus { TRAIN, TEST, DEV, CROSSVAL }
 	
@@ -29,12 +34,25 @@ public class SharpCorpusSplit {
 		map.put("ss1_batch19", Subcorpus.TRAIN); 
 	}
 	
-	public static Subcorpus split( File directory ) {
-		if (map.containsKey(directory.getName())) {
-//			System.out.println(directory.toString());
-			return map.get(directory.getName());
-		} else {
-			return Subcorpus.TRAIN;
-		}
+	public static Subcorpus splitSeed( File directory ) {
+		// Batch names listed in the lookup table (e.g. "ss1_batch19") keep their table assignment.
+		String name = directory.getName();
+		if (map.containsKey(name)) return map.get(name);
+		// Otherwise use the bare batch number; a non-numeric name falls back to TRAIN as split(File) did.
+		int batchNum;
+		try { batchNum = Integer.parseInt(name); } catch (NumberFormatException e) { return Subcorpus.TRAIN; }
+		if(batchNum == 10 || batchNum == 17) return Subcorpus.DEV;
+		else if(batchNum == 11 || batchNum == 12) return Subcorpus.TEST;
+		else return Subcorpus.TRAIN;
+	}
+	
+	public static Subcorpus splitStratified(int batchNum){
+	  if(batchNum % 5 < 3) return Subcorpus.TRAIN;
+	  else if(batchNum % 5 == 3) return Subcorpus.DEV;
+	  else return Subcorpus.TEST;
+	}
+	
+	public static Subcorpus splitStratified(File file){ // the file's name is parsed as the decimal batch number
+	  return splitStratified(Integer.parseInt(file.getName(), 10));
 	}
 }