You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by sw...@apache.org on 2013/08/19 20:17:09 UTC

svn commit: r1515543 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion: eval/AssertionEvaluation.java pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java

Author: swu
Date: Mon Aug 19 18:17:09 2013
New Revision: 1515543

URL: http://svn.apache.org/r1515543
Log:
trying to get mipacq and negex test set to read and train/test correctly

Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?rev=1515543&r1=1515542&r2=1515543&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Mon Aug 19 18:17:09 2013
@@ -544,20 +544,26 @@ public static void printScore(Map<String
         paths);
   }
 
-  public static void preprocess(File preprocessDir ) throws ResourceInitializationException, UIMAException, IOException {
+  public static void preprocess(File rawDir ) throws ResourceInitializationException, UIMAException, IOException {
 //	  File devDirectory = new File(options.trainDirectory.getParentFile() + File.separator + "dev");
-	  File trainDir = null;
+	  File preprocessedDir = null;
 	  if (options.trainDirectory.split("[;:]").length>1) {
 		  throw new IOException("Assertion preprocess wants to write to one train directory, but you've supplied multiple: " + options.trainDirectory);
 	  } else {
-		  trainDir = new File(options.trainDirectory);
+		  preprocessedDir = new File(options.trainDirectory);
 	  }
-	  if (preprocessDir.getAbsolutePath().contains("i2b2")) {
-		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readI2B2Challenge2010(preprocessDir, trainDir);
+	  if (rawDir.getAbsolutePath().contains("i2b2")) {
+		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readI2B2Challenge2010(rawDir, preprocessedDir);
 		  
-	  } else {
+	  } else if (rawDir.getAbsolutePath().contains("mipacq")) {
+		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readMiPACQ(rawDir, preprocessedDir);
+		  
+	  } else if (rawDir.getAbsolutePath().contains("negex")) {
+		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readNegexTestSet(rawDir, preprocessedDir);
+		  
+	  } else{
 		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readSharpUmlsCem(
-				  preprocessDir, trainDir, options.testDirectory, options.devDirectory);
+				  rawDir, preprocessedDir, options.testDirectory, options.devDirectory);
 	  }
   }
   
@@ -1182,8 +1188,8 @@ private static void printErrors(JCas jCa
 		  } else  {
 			  if(!goldLabel.equals(systemLabel)){
 				  if(trueCategory == null){
-					  // used for multi-class case:
-					  System.out.println(classifierType+" Incorrectly labeled as " + systemLabel + " when the example was " + goldLabel + ": " + formatError(jCas, goldAnnotation));
+					  // used for multi-class case.  Incorrect_system_label(Correct_label):
+					  System.out.println(classifierType+ " "+ systemLabel + "(" + goldLabel + "): " + formatError(jCas, systemAnnotation));
 				  }else if(systemLabel.equals(trueCategory)){
 					  System.out.println(classifierType+" FP: " + typeId  + " " + formatError(jCas, systemAnnotation) + "| gold:|" + formatError(jCas, goldAnnotation) + " " + documentId);
 				  }else{
@@ -1192,6 +1198,7 @@ private static void printErrors(JCas jCa
 			  }else{
 			    if(trueCategory == null){
 			      // multi-class case -- probably don't want to print anything?
+			    	System.out.println(classifierType+ " "+ systemLabel + "(" + goldLabel + "): " + formatError(jCas, systemAnnotation));
 			    }else if(systemLabel.equals(trueCategory)){
 					  System.out.println(classifierType+" TP: " + typeId + " " + formatError(jCas, systemAnnotation) + "| gold:|" + formatError(jCas, goldAnnotation) + " " + documentId);
 				  }else{

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java?rev=1515543&r1=1515542&r2=1515543&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java Mon Aug 19 18:17:09 2013
@@ -38,6 +38,8 @@ import org.uimafit.pipeline.SimplePipeli
 
 import org.apache.ctakes.assertion.cr.GoldEntityAndAttributeReader;
 import org.apache.ctakes.assertion.cr.I2B2Challenge2010CollectionReader;
+import org.apache.ctakes.assertion.cr.MiPACQKnowtatorXMLReader;
+import org.apache.ctakes.assertion.cr.NegExCorpusReader;
 import org.apache.ctakes.core.ae.SHARPKnowtatorXMLReader;
 import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
 
@@ -209,7 +211,7 @@ public class GoldEntityAndAttributeReade
 		logger.info("Finished!");
 	}
 	
-	public static void readI2B2Challenge2010(File parentDirectory, File trainDirectory)
+	public static void readI2B2Challenge2010(File parentDirectory, File preprocessedDirectory)
 	throws ResourceInitializationException, UIMAException, IOException {
 
 		TypeSystemDescription typeSystemDescription = 
@@ -233,12 +235,96 @@ public class GoldEntityAndAttributeReade
 		);
 		aggregate.add(preprocessAnnotator);
 
-		if (trainDirectory!=null) {
+		if (preprocessedDirectory!=null) {
 			AnalysisEngineDescription xWriter2 = AnalysisEngineFactory.createPrimitiveDescription(
 					XWriter.class,
 					typeSystemDescription,
 					XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
-					trainDirectory,
+					preprocessedDirectory,
+					XWriter.PARAM_FILE_NAMER_CLASS_NAME,
+					CtakesFileNamer.class.getName()
+			);
+			aggregate.add(xWriter2);
+			//		SimplePipeline.runPipeline(collectionReader, goldAnnotator, xWriter, xWriter2);
+		}
+
+		SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
+		logger.info("Finished!");
+	}
+
+	public static void readNegexTestSet(File inputFile, File preprocessedDirectory)
+	throws ResourceInitializationException, UIMAException, IOException {
+
+		TypeSystemDescription typeSystemDescription = 
+			TypeSystemDescriptionFactory.createTypeSystemDescription();
+
+		AggregateBuilder aggregate = new AggregateBuilder();
+
+		// input dir is hard-coded in AssertionConst
+		CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
+				NegExCorpusReader.class,
+				typeSystemDescription
+		);
+
+		// fill in other values that are necessary for preprocessing
+		AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
+				"desc/analysis_engine/AttributeDiscoveryPreprocessor"
+		);
+		aggregate.add(preprocessAnnotator);
+
+		if (preprocessedDirectory!=null) {
+			AnalysisEngineDescription xWriter2 = AnalysisEngineFactory.createPrimitiveDescription(
+					XWriter.class,
+					typeSystemDescription,
+					XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
+					preprocessedDirectory,
+					XWriter.PARAM_FILE_NAMER_CLASS_NAME,
+					CtakesFileNamer.class.getName()
+			);
+			aggregate.add(xWriter2);
+			//		SimplePipeline.runPipeline(collectionReader, goldAnnotator, xWriter, xWriter2);
+		}
+
+		SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
+		logger.info("Finished!");
+	}
+
+	public static void readMiPACQ(File inputDirectory, File preprocessedDirectory)
+	throws ResourceInitializationException, UIMAException, IOException {
+
+		TypeSystemDescription typeSystemDescription = 
+			TypeSystemDescriptionFactory.createTypeSystemDescription();
+
+		AggregateBuilder aggregate = new AggregateBuilder();
+
+		CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
+				FilesInDirectoryCollectionReader.class,
+				typeSystemDescription,
+				"InputDirectory",
+				inputDirectory
+				);
+		
+		// read the MiPACQ gold annotations from Knowtator XML
+		AnalysisEngineDescription goldAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
+				MiPACQKnowtatorXMLReader.class,
+				typeSystemDescription,
+				MiPACQKnowtatorXMLReader.PARAM_TEXT_DIRECTORY,
+				preprocessedDirectory
+		);
+		
+		aggregate.add(goldAnnotator);
+		// fill in other values that are necessary for preprocessing
+		AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
+				"desc/analysis_engine/AttributeDiscoveryPreprocessor"
+		);
+		aggregate.add(preprocessAnnotator);
+
+		if (preprocessedDirectory!=null) {
+			AnalysisEngineDescription xWriter2 = AnalysisEngineFactory.createPrimitiveDescription(
+					XWriter.class,
+					typeSystemDescription,
+					XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
+					preprocessedDirectory,
 					XWriter.PARAM_FILE_NAMER_CLASS_NAME,
 					CtakesFileNamer.class.getName()
 			);