You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by sw...@apache.org on 2013/06/19 22:55:39 UTC

svn commit: r1494773 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion: cr/ eval/ train/ util/

Author: swu
Date: Wed Jun 19 20:55:38 2013
New Revision: 1494773

URL: http://svn.apache.org/r1494773
Log:
Preprocessing, training, and cross-validation for the assertion module are now all available programmatically instead of through launches

Added:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java
Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java?rev=1494773&r1=1494772&r2=1494773&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java Wed Jun 19 20:55:38 2013
@@ -28,9 +28,9 @@ import java.util.Scanner;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.ctakes.typesystem.type.refsem.Entity;
+import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.structured.DocumentID;
-import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.log4j.Logger;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
@@ -138,31 +138,34 @@ public class I2B2Challenge2010Collection
 					if(word2char.containsKey(pair)){
 						int charOffset = word2char.get(pair);
 						int end = charOffset + m.group(1).length();
-						Entity entity = new Entity(jcas);
-						EntityMention mention = new EntityMention(jcas, charOffset, end);
+//						Entity entity = new Entity(jcas);
+						EventMention mention = new EventMention(jcas, charOffset, end);
+
 						// set default values...
-						mention.setPolarity(1);
-						mention.setConditional(false);
-						mention.setUncertainty(-1);
-						mention.setGeneric(false);
-						mention.setSubject("patient");
+						mention.setPolarity(CONST.NE_POLARITY_NEGATION_ABSENT);
+						mention.setConditional(CONST.NE_CONDITIONAL_FALSE);
+						mention.setUncertainty(CONST.NE_UNCERTAINTY_ABSENT);
+						mention.setGeneric(CONST.NE_GENERIC_FALSE);
+						mention.setSubject(CONST.ATTR_SUBJECT_PATIENT);
+
+						// set non-default values. mappings follow MITRE's conventions (see AssertionAnalysisEngine)
 						if(m.group(7).equals("absent")){
 //							negSet.add(charOffset+"-"+end);
-							mention.setPolarity(-1);
+							mention.setPolarity(CONST.NE_POLARITY_NEGATION_PRESENT);
 						}else if(m.group(7).equals("hypothetical")){
 //							hypothSet.add(charOffset+"-"+end);
-							mention.setGeneric(true);
+							mention.setConditional(CONST.NE_CONDITIONAL_TRUE);
 						}else if(m.group(7).equals("possible")){
 //							possSet.add(charOffset+"-"+end);
-							mention.setUncertainty(1);
+							mention.setUncertainty(CONST.NE_UNCERTAINTY_PRESENT);
 						}else if(m.group(7).equals("associated_with_someone_else")){
 //							nasSet.add(charOffset+"-"+end);
-							mention.setSubject("other");
-						}else if(m.group(7).equals("conditional")){
-//							condSet.add(charOffset+"-"+end);
-							mention.setConditional(true);
-//						}else if(m.group(7).equals("present")){
-//							presSet.add(charOffset+"-"+end);    // NOTE: There is no "present" setting, it is an inference from other things not being set.
+							mention.setSubject(CONST.ATTR_SUBJECT_FAMILY_MEMBER); // the most common non-patient case
+						}else if(m.group(7).equals("conditional")){ // no good mapping.
+////							condSet.add(charOffset+"-"+end);
+//							mention.setConditional(true);
+////						}else if(m.group(7).equals("present")){
+////							presSet.add(charOffset+"-"+end);    // NOTE: There is no "present" setting, it is an inference from other things not being set.
 						}
 						mention.addToIndexes();
 					}

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?rev=1494773&r1=1494772&r2=1494773&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Wed Jun 19 20:55:38 2013
@@ -115,7 +115,7 @@ public class AssertionEvaluation extends
     @Option(
         name = "--train-dir",
         usage = "specify the directory containing the XMI training files (for example, /NLP/Corpus/Relations/mipacq/xmi/train)",
-        required = true)
+        required = false)
     public String trainDirectory;
     
     @Option(
@@ -133,7 +133,7 @@ public class AssertionEvaluation extends
     @Option(
         name = "--models-dir",
         usage = "specify the directory where the models will be placed",
-        required = true)
+        required = false)
     public File modelsDirectory;
     
     @Option(
@@ -251,12 +251,14 @@ protected static Options options = new O
 //    System.err.println("forcing skipping of conditional processing!!!");
 //    options.runConditional = false;
     printOptionsForDebugging(options);
-    String[] dirs = options.trainDirectory.split("[;:]");
     List<File> trainFiles = new ArrayList<File>();
-    for (String dir : dirs) {
-    	File trainDir = new File(dir);
-    	trainFiles.addAll(Arrays.asList(trainDir.listFiles()));
-    	System.out.println(trainFiles.toString());
+    if (null != options.trainDirectory) {
+    	String[] dirs = options.trainDirectory.split("[;:]");
+    	for (String dir : dirs) {
+    		File trainDir = new File(dir);
+    		trainFiles.addAll(Arrays.asList(trainDir.listFiles()));
+    		//    	System.out.println(trainFiles.toString());
+    	}
     }
     //File modelsDir = new File("models/modifier");
     File modelsDir = options.modelsDirectory;
@@ -304,7 +306,7 @@ protected static Options options = new O
     }
     
     // run cross-validation
-    else if(options.testDirectory == null || options.crossValidationFolds != null) {
+    else if(options.crossValidationFolds != null) {
       // run n-fold cross-validation
       List<Map<String, AnnotationStatistics>> foldStats = evaluation.crossValidation(trainFiles, options.crossValidationFolds);
       //AnnotationStatistics overallStats = AnnotationStatistics.addAll(foldStats);
@@ -335,6 +337,8 @@ protected static Options options = new O
       if (options.evalOnly) {
     	  testFiles = Arrays.asList(options.evaluationOutputDirectory.listFiles());
     	  logger.debug("evalOnly using files in directory " + evaluationOutputDirectory.getName() + " aka " + evaluationOutputDirectory.getCanonicalPath());
+      } else if (options.trainOnly){
+    	  testFiles = new ArrayList<File>();
       } else {
     	  testFiles = Arrays.asList(options.testDirectory.listFiles());
       }
@@ -400,7 +404,7 @@ private static void printOptionsForDebug
 	    "%n%n",
 	    options.trainDirectory,
 	    (options.testDirectory != null) ? options.testDirectory.getAbsolutePath() : "",
-	    options.modelsDirectory.getAbsolutePath(),
+	    (options.modelsDirectory!=null) ? options.modelsDirectory.getAbsolutePath() : "",
 	    options.crossValidationFolds,
 	    options.ignorePolarity,
 	    options.ignoreConditional,
@@ -469,7 +473,7 @@ public static void printScore(Map<String
 	  } else {
 		  trainDir = new File(options.trainDirectory);
 	  }
-	  if (preprocessDir.getName().contains("i2b2")) {
+	  if (preprocessDir.getAbsolutePath().contains("i2b2")) {
 		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readI2B2Challenge2010(preprocessDir, trainDir);
 		  
 	  } else {

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java?rev=1494773&r1=1494772&r2=1494773&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java Wed Jun 19 20:55:38 2013
@@ -4,43 +4,24 @@ import java.util.ArrayList;
 import java.util.HashMap;
 
 import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+import org.apache.ctakes.assertion.util.AssertionConst;
 
 import scala.actors.threadpool.Arrays;
 
 public class CrossValidateAttributeModels {
 
-	private static ArrayList<String> annotationTypes = new ArrayList<String>();
-	static { 
-		annotationTypes.add("polarity");
-		annotationTypes.add("conditional");
-		annotationTypes.add("uncertainty");
-		annotationTypes.add("subject");
-		annotationTypes.add("generic");
-		annotationTypes.add("historyOf");
-	}
-
-	// Specify training directories for each attribute in a (semi)colon-separated list, e.g., "sharp_data/dev:sharp_data/train"
-	private static HashMap<String,String> trainingDirectories = new HashMap<String,String>();
-	static { 
-		trainingDirectories.put("polarity","sharp_data/dev");
-		trainingDirectories.put("conditional","sharp_data/dev");
-		trainingDirectories.put("uncertainty","sharp_data/dev");
-		trainingDirectories.put("subject","sharp_data/dev");
-		trainingDirectories.put("generic","sharp_data/dev");
-		trainingDirectories.put("historyOf","sharp_data/dev");
-	}
 	public static void main(String[] args) throws Exception {
 		
-		for (String attribute : annotationTypes) {
+		for (String attribute : AssertionConst.annotationTypes) {
 			
 			ArrayList<String> params = new ArrayList<String>();
 
-			params.add("--train-dir"); 			params.add(trainingDirectories.get(attribute));
+			params.add("--train-dir"); 			params.add(AssertionConst.trainingDirectories.get(attribute));
 			params.add("--models-dir"); 		params.add("sharp_data/model/eval.model");
 			params.add("--cross-validation"); 	params.add("5");
 			
 			// Build up an "ignore" string
-			for (String ignoreAttribute : annotationTypes) {
+			for (String ignoreAttribute : AssertionConst.annotationTypes) {
 				if (!ignoreAttribute.equals(attribute)) { 
 
 					if (ignoreAttribute.equals("historyOf")) {
@@ -52,7 +33,7 @@ public class CrossValidateAttributeModel
 			}
 			String[] paramList = params.toArray(new String[]{});
 			
-			System.out.println(Arrays.asList(paramList).toString());
+//			System.out.println(Arrays.asList(paramList).toString());
 			
 			// Run the actual assertion training on just one attribute
 			AssertionEvaluation.main( paramList );

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java?rev=1494773&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java Wed Jun 19 20:55:38 2013
@@ -0,0 +1,44 @@
+package org.apache.ctakes.assertion.train;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+import org.apache.ctakes.assertion.util.AssertionConst;
+
+import scala.actors.threadpool.Arrays;
+
+public class ReadAndPreprocessForAttributeModels {
+
+	public static void main(String[] args) throws Exception {
+		
+		for (String source : AssertionConst.preprocessRootDirectory.keySet()) {
+			
+			ArrayList<String> params = new ArrayList<String>();
+
+			// Always preprocess something to a main directory, usually for training
+			params.add("--train-dir"); 		params.add(AssertionConst.preprocessRootDirectory.get(source));
+
+			// Some corpora (SHARP) may have predetermined dev/test splits. Check AssertionConst.
+			if (AssertionConst.preprocessForDev.containsKey(source) ) {
+				params.add("--dev-dir"); 	params.add(AssertionConst.preprocessForDev.get(source));
+			}
+			if (AssertionConst.preprocessForTest.containsKey(source) ) {
+				params.add("--test-dir"); 	params.add(AssertionConst.preprocessForTest.get(source));
+			}
+			
+			// Specify preprocessing directory (See AssertionConst)
+			params.add("--preprocess-only"); 	params.add(source);
+			
+			String[] paramList = params.toArray(new String[]{});
+			
+//			System.out.println(Arrays.asList(paramList).toString());
+			
+			// Run the actual assertion preprocessing on just one data source
+			AssertionEvaluation.main( paramList );
+		}
+		
+		
+		
+	}
+}

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java?rev=1494773&r1=1494772&r2=1494773&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java Wed Jun 19 20:55:38 2013
@@ -4,45 +4,26 @@ import java.util.ArrayList;
 import java.util.HashMap;
 
 import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+import org.apache.ctakes.assertion.util.AssertionConst;
 
 import scala.actors.threadpool.Arrays;
 
 public class TrainAttributeModels {
 
-	private static ArrayList<String> annotationTypes = new ArrayList<String>();
-	static { 
-		annotationTypes.add("polarity");
-		annotationTypes.add("conditional");
-		annotationTypes.add("uncertainty");
-		annotationTypes.add("subject");
-		annotationTypes.add("generic");
-		annotationTypes.add("historyOf");
-	}
-
-	// Specify training directories for each attribute in a (semi)colon-separated list, e.g., "sharp_data/dev:sharp_data/train"
-	private static HashMap<String,String> trainingDirectories = new HashMap<String,String>();
-	static { 
-		trainingDirectories.put("polarity","sharp_data/train:i2b2_data/train");
-		trainingDirectories.put("conditional","sharp_data/train");
-		trainingDirectories.put("uncertainty","sharp_data/train");
-		trainingDirectories.put("subject","sharp_data/train");
-		trainingDirectories.put("generic","sharp_data/train");
-		trainingDirectories.put("historyOf","sharp_data/train");
-	}
 	public static void main(String[] args) throws Exception {
 		
-		for (String attribute : annotationTypes) {
+		for (String attribute : AssertionConst.annotationTypes) {
 			
 			ArrayList<String> params = new ArrayList<String>();
 
-			params.add("--train-dir"); 	params.add(trainingDirectories.get(attribute));
+			params.add("--train-dir"); 	params.add(AssertionConst.trainingDirectories.get(attribute));
 //			params.add("--test-dir"); 	params.add("sharp_data/dev");
 			params.add("--models-dir"); params.add("sharp_data/model/eval.model");
 //			params.add("--evaluation-output-dir");	params.add("sharp_data/output"); 
 			params.add("--train-only"); 
 			
 			// Build up an "ignore" string
-			for (String ignoreAttribute : annotationTypes) {
+			for (String ignoreAttribute : AssertionConst.annotationTypes) {
 				if (!ignoreAttribute.equals(attribute)) { 
 
 					if (ignoreAttribute.equals("historyOf")) {
@@ -54,7 +35,7 @@ public class TrainAttributeModels {
 			}
 			String[] paramList = params.toArray(new String[]{});
 			
-			System.out.println(Arrays.asList(paramList).toString());
+//			System.out.println(Arrays.asList(paramList).toString());
 			
 			// Run the actual assertion training on just one attribute
 			AssertionEvaluation.main( paramList );

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java?rev=1494773&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java Wed Jun 19 20:55:38 2013
@@ -0,0 +1,70 @@
+package org.apache.ctakes.assertion.util;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class AssertionConst {
+	
+	/*** CHANGE THESE ***/
+
+	// Locally stored corpora: absolute paths on the developer's machine -- edit them to match your setup
+	
+	// expects subdirectories: "Mayo/UMLS_CEM/*batch*/Knowtator*" "Seattle Group Health/UMLS_CEM/*batch*/Knowtator*"
+	public static final String SHARP_SEED_CORPUS = "/Users/m081914/work/data/sharp/Seed Corpus";
+	// expects subdirectories: ast, txt 
+	public static final String I2B2_2010_CORPUS = "/Users/m081914/work/data/i2b2Challenge2010/Data/i2b2Challenge2010AllTrain";
+	// expects subdirectories: ast, txt
+	public static final String I2B2_2010_TEST_CORPUS = "/Users/m081914/work/data/i2b2Challenge2010/Data/Test/reports";
+
+	// Specify training directories for each attribute in a (semi)colon-separated list, e.g., "sharp_data/dev:sharp_data/train"
+	public static HashMap<String,String> trainingDirectories = new HashMap<String,String>();
+	static { 
+		trainingDirectories.put("polarity","sharp_data/train:i2b2_data/train");
+//		trainingDirectories.put("polarity","sharp_data/train");
+//		trainingDirectories.put("polarity","i2b2_data/train");
+		trainingDirectories.put("conditional","sharp_data/train");
+		trainingDirectories.put("uncertainty","sharp_data/train");
+		trainingDirectories.put("subject","sharp_data/train");
+		trainingDirectories.put("generic","sharp_data/train");
+		trainingDirectories.put("historyOf","sharp_data/train");
+	}
+		
+	// Attributes to train/cross-validate; comment out the individual add(...) entries you want to skip
+	public static ArrayList<String> annotationTypes = new ArrayList<String>();
+	static { 
+		annotationTypes.add("polarity");
+		annotationTypes.add("conditional");
+		annotationTypes.add("uncertainty");
+		annotationTypes.add("subject");
+		annotationTypes.add("generic");
+		annotationTypes.add("historyOf");
+	}
+	
+	
+	/*** DON'T CHANGE THESE ***/
+
+	// Maps each raw corpus location (key: preprocessing input) to the directory that receives the preprocessed output (value).  Results are normally used for model training
+	public static HashMap<String,String> preprocessRootDirectory = new HashMap<String,String>();
+	static { 
+		preprocessRootDirectory.put(SHARP_SEED_CORPUS+"/Mayo/UMLS_CEM","sharp_data/train");
+		preprocessRootDirectory.put(SHARP_SEED_CORPUS+"/Seattle Group Health/UMLS_CEM","sharp_data/train");
+		preprocessRootDirectory.put(I2B2_2010_CORPUS,"i2b2_data/train");
+		preprocessRootDirectory.put(I2B2_2010_TEST_CORPUS,"i2b2_data/test");
+	}
+	
+	// Specify input and output data locations for preprocessing.  Results will be used for model test
+	public static HashMap<String,String> preprocessForTest = new HashMap<String,String>();
+	static { 
+		preprocessForTest.put(SHARP_SEED_CORPUS+"/Mayo/UMLS_CEM","sharp_data/test");
+		preprocessForTest.put(SHARP_SEED_CORPUS+"/Seattle Group Health/UMLS_CEM","sharp_data/test");
+	}
+
+	// Specify input and output data locations for preprocessing.  Results will be used for model dev
+	public static HashMap<String,String> preprocessForDev = new HashMap<String,String>();
+	static { 
+		preprocessForDev.put(SHARP_SEED_CORPUS+"/Mayo/UMLS_CEM","sharp_data/dev");
+		preprocessForDev.put(SHARP_SEED_CORPUS+"/Seattle Group Health/UMLS_CEM","sharp_data/dev");
+	}
+	
+	
+}