You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by sw...@apache.org on 2013/06/18 21:30:10 UTC

svn commit: r1494265 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion: eval/ pipelines/ train/

Author: swu
Date: Tue Jun 18 19:30:10 2013
New Revision: 1494265

URL: http://svn.apache.org/r1494265
Log:
possible to train assertion models with different data sources

Added:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java
Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?rev=1494265&r1=1494264&r2=1494265&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Tue Jun 18 19:30:10 2013
@@ -237,7 +237,8 @@ protected static Options options = new O
   
   public static void main(String[] args) throws Exception {
     //Options options = new Options();
-    options.parseOptions(args);
+	  resetOptions();
+	  options.parseOptions(args);
     
 //    System.err.println("forcing skipping of subject processing!!!");
 //    options.runSubject = false;
@@ -342,21 +343,48 @@ protected static Options options = new O
     	  CollectionReader trainCollectionReader = evaluation.getCollectionReader(trainFiles);
     	  evaluation.train(trainCollectionReader, modelsDir);
       }
-      if (testFiles==null || testFiles.size()==0) {
-    	  throw new RuntimeException("testFiles = " + testFiles + " testFiles.size() = " + (testFiles==null ? "null": testFiles.size())) ;
-      }
-      logger.debug("testFiles.size() = " + testFiles.size());
-      CollectionReader testCollectionReader = evaluation.getCollectionReader(testFiles);
-      Map<String, AnnotationStatistics> stats = evaluation.test(testCollectionReader, modelsDir);
       
-      AssertionEvaluation.printScore(stats,  modelsDir.getAbsolutePath());
+      if (!options.trainOnly) {
+    	  if (testFiles==null || testFiles.size()==0) {
+    		  throw new RuntimeException("testFiles = " + testFiles + " testFiles.size() = " + (testFiles==null ? "null": testFiles.size())) ;
+    	  }
+    	  logger.debug("testFiles.size() = " + testFiles.size());
+    	  CollectionReader testCollectionReader = evaluation.getCollectionReader(testFiles);
+    	  Map<String, AnnotationStatistics> stats = evaluation.test(testCollectionReader, modelsDir);
+
+    	  AssertionEvaluation.printScore(stats,  modelsDir.getAbsolutePath());
+      }
     }
     
     System.out.println("Finished assertion module.");
     
   }
   
-  private static void printOptionsForDebugging(Options options)
+  /** Sets the boolean option fields below to false and the directory/fold fields to null; main() calls this before parsing its arguments. */
+  private static void resetOptions() {
+	  options.trainOnly = false;
+	  options.testOnly = false;
+	  options.evalOnly = false;
+	  options.noCleartk = false;
+	  options.printErrors = false;
+	  // per-attribute "ignore" switches
+	  options.ignorePolarity = false;
+	  options.ignoreConditional = false;
+	  options.ignoreUncertainty = false;
+	  options.ignoreSubject = false;
+	  options.ignoreGeneric = false;
+	  options.ignoreHistory = false;
+	  // directory settings and cross-validation fold count go back to "not set"
+	  options.trainDirectory = null;
+	  options.testDirectory = null;
+	  options.devDirectory = null;
+	  options.modelsDirectory = null;
+	  options.preprocessDir = null;
+	  options.evaluationOutputDirectory = null;
+	  options.crossValidationFolds = null;
+  }
+
+private static void printOptionsForDebugging(Options options)
   {
 	System.out.format(
 		"training dir: %s%n" +
@@ -442,6 +470,7 @@ public static void printScore(Map<String
 		  trainDir = new File(options.trainDirectory);
 	  }
 	  if (preprocessDir.getName().contains("i2b2")) {
+		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readI2B2Challenge2010(preprocessDir, trainDir);
 		  
 	  } else {
 		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readSharpUmlsCem(

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java?rev=1494265&r1=1494264&r2=1494265&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java Tue Jun 18 19:30:10 2013
@@ -37,6 +37,7 @@ import org.uimafit.factory.TypeSystemDes
 import org.uimafit.pipeline.SimplePipeline;
 
 import org.apache.ctakes.assertion.cr.GoldEntityAndAttributeReader;
+import org.apache.ctakes.assertion.cr.I2B2Challenge2010CollectionReader;
 import org.apache.ctakes.core.ae.SHARPKnowtatorXMLReader;
 import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
 
@@ -207,4 +208,46 @@ public class GoldEntityAndAttributeReade
 
 		logger.info("Finished!");
 	}
+	
+	/**
+	 * Runs the I2B2Challenge2010CollectionReader over {@code parentDirectory} followed by the
+	 * AttributeDiscoveryPreprocessor descriptor and, when {@code trainDirectory} is non-null,
+	 * an XWriter configured with that directory as its output directory.
+	 *
+	 * @param parentDirectory value given to the reader's "inputDir" parameter
+	 * @param trainDirectory XWriter output directory, or null to run without the XWriter
+	 */
+	public static void readI2B2Challenge2010(File parentDirectory, File trainDirectory)
+	throws ResourceInitializationException, UIMAException, IOException {
+		// use the uimafit method of finding available type system
+		// descriptor via META-INF/org.uimafit/types.txt
+		// (found in ctakes-type-system/src/main/resources)
+		TypeSystemDescription types = TypeSystemDescriptionFactory.createTypeSystemDescription();
+
+		AggregateBuilder builder = new AggregateBuilder();
+
+		CollectionReaderDescription reader = CollectionReaderFactory.createDescription(
+				I2B2Challenge2010CollectionReader.class,
+				types,
+				"inputDir",
+				parentDirectory);
+
+		// fill in other values that are necessary for preprocessing
+		builder.add(AnalysisEngineFactory.createAnalysisEngineDescription(
+				"desc/analysis_engine/AttributeDiscoveryPreprocessor"));
+
+		if (trainDirectory != null) {
+			// the XWriter is only added when an output directory was supplied
+			builder.add(AnalysisEngineFactory.createPrimitiveDescription(
+					XWriter.class,
+					types,
+					XWriter.PARAM_OUTPUT_DIRECTORY_NAME, trainDirectory,
+					XWriter.PARAM_FILE_NAMER_CLASS_NAME, CtakesFileNamer.class.getName()));
+		}
+
+		AnalysisEngineDescription pipeline = builder.createAggregateDescription();
+		SimplePipeline.runPipeline(reader, pipeline);
+		logger.info("Finished!");
+	}
+
 }

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java?rev=1494265&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java Tue Jun 18 19:30:10 2013
@@ -0,0 +1,64 @@
+package org.apache.ctakes.assertion.train;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+
+import scala.actors.threadpool.Arrays;
+
+/**
+ * Cross-validates each assertion attribute on its own: for every attribute it calls
+ * {@link AssertionEvaluation#main(String[])} with "--cross-validation 5" and an
+ * "--ignore-" flag for each of the other attributes.
+ */
+public class CrossValidateAttributeModels {
+
+	private static final ArrayList<String> annotationTypes = new ArrayList<String>();
+	static { 
+		annotationTypes.add("polarity");
+		annotationTypes.add("conditional");
+		annotationTypes.add("uncertainty");
+		annotationTypes.add("subject");
+		annotationTypes.add("generic");
+		annotationTypes.add("historyOf");
+	}
+
+	// Training directories for each attribute, as a colon- or semicolon-separated list, e.g., "sharp_data/dev:sharp_data/train"
+	private static final HashMap<String,String> trainingDirectories = new HashMap<String,String>();
+	static { 
+		trainingDirectories.put("polarity","sharp_data/dev");
+		trainingDirectories.put("conditional","sharp_data/dev");
+		trainingDirectories.put("uncertainty","sharp_data/dev");
+		trainingDirectories.put("subject","sharp_data/dev");
+		trainingDirectories.put("generic","sharp_data/dev");
+		trainingDirectories.put("historyOf","sharp_data/dev");
+	}
+
+	/**
+	 * Builds one argument list per attribute and hands it to AssertionEvaluation.main.
+	 * @param args ignored; all settings are hard-coded in this class
+	 * @throws Exception whatever AssertionEvaluation.main throws
+	 */
+	public static void main(String[] args) throws Exception {
+		for (String attribute : annotationTypes) {
+			ArrayList<String> params = new ArrayList<String>();
+			params.add("--train-dir"); 			params.add(trainingDirectories.get(attribute));
+			params.add("--models-dir"); 		params.add("sharp_data/model/eval.model");
+			params.add("--cross-validation"); 	params.add("5");
+
+			// Ignore every attribute other than the one being evaluated
+			for (String other : annotationTypes) {
+				if (!other.equals(attribute)) {
+					// "historyOf" is passed as --ignore-history
+					params.add("--ignore-" + ("historyOf".equals(other) ? "history" : other));
+				}
+			}
+
+			// Print the list itself: same "[a, b, ...]" text, without going through a non-JDK Arrays class
+			System.out.println(params);
+			// Run the actual assertion evaluation on just one attribute
+			AssertionEvaluation.main(params.toArray(new String[params.size()]));
+		}
+	}
+}

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java?rev=1494265&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java Tue Jun 18 19:30:10 2013
@@ -0,0 +1,66 @@
+package org.apache.ctakes.assertion.train;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+
+import scala.actors.threadpool.Arrays;
+
+/**
+ * Trains each assertion attribute on its own: for every attribute it calls
+ * {@link AssertionEvaluation#main(String[])} with "--train-only" and an
+ * "--ignore-" flag for each of the other attributes.
+ */
+public class TrainAttributeModels {
+
+	private static final ArrayList<String> annotationTypes = new ArrayList<String>();
+	static { 
+		annotationTypes.add("polarity");
+		annotationTypes.add("conditional");
+		annotationTypes.add("uncertainty");
+		annotationTypes.add("subject");
+		annotationTypes.add("generic");
+		annotationTypes.add("historyOf");
+	}
+
+	// Training directories for each attribute, as a colon- or semicolon-separated list, e.g., "sharp_data/dev:sharp_data/train"
+	private static final HashMap<String,String> trainingDirectories = new HashMap<String,String>();
+	static { 
+		trainingDirectories.put("polarity","sharp_data/train:i2b2_data/train");
+		trainingDirectories.put("conditional","sharp_data/train");
+		trainingDirectories.put("uncertainty","sharp_data/train");
+		trainingDirectories.put("subject","sharp_data/train");
+		trainingDirectories.put("generic","sharp_data/train");
+		trainingDirectories.put("historyOf","sharp_data/train");
+	}
+
+	/**
+	 * Builds one argument list per attribute and hands it to AssertionEvaluation.main.
+	 *
+	 * @param args ignored; all settings are hard-coded in this class
+	 * @throws Exception whatever AssertionEvaluation.main throws
+	 */
+	public static void main(String[] args) throws Exception {
+		for (String attribute : annotationTypes) {
+			ArrayList<String> params = new ArrayList<String>();
+			params.add("--train-dir"); 	params.add(trainingDirectories.get(attribute));
+			params.add("--models-dir"); params.add("sharp_data/model/eval.model");
+			// no --test-dir / --evaluation-output-dir is passed: this run only trains
+			params.add("--train-only"); 
+
+			// Ignore every attribute other than the one being trained
+			for (String other : annotationTypes) {
+				if (!other.equals(attribute)) {
+					// "historyOf" is passed as --ignore-history
+					params.add("--ignore-" + ("historyOf".equals(other) ? "history" : other));
+				}
+			}
+
+			// Print the list itself: same "[a, b, ...]" text, without going through a non-JDK Arrays class
+			System.out.println(params);
+			// Run the actual assertion training on just one attribute
+			AssertionEvaluation.main(params.toArray(new String[params.size()]));
+		}
+	}
+}