You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by sw...@apache.org on 2013/06/19 22:55:39 UTC
svn commit: r1494773 - in
/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion:
cr/ eval/ train/ util/
Author: swu
Date: Wed Jun 19 20:55:38 2013
New Revision: 1494773
URL: http://svn.apache.org/r1494773
Log:
Preprocess, Train, and Crossvalidate for assertion module all available programmatically instead of through launches
Added:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java
Modified:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java?rev=1494773&r1=1494772&r2=1494773&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/I2B2Challenge2010CollectionReader.java Wed Jun 19 20:55:38 2013
@@ -28,9 +28,9 @@ import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.ctakes.typesystem.type.refsem.Entity;
+import org.apache.ctakes.typesystem.type.constants.CONST;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
-import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.log4j.Logger;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
@@ -138,31 +138,34 @@ public class I2B2Challenge2010Collection
if(word2char.containsKey(pair)){
int charOffset = word2char.get(pair);
int end = charOffset + m.group(1).length();
- Entity entity = new Entity(jcas);
- EntityMention mention = new EntityMention(jcas, charOffset, end);
+// Entity entity = new Entity(jcas);
+ EventMention mention = new EventMention(jcas, charOffset, end);
+
// set default values...
- mention.setPolarity(1);
- mention.setConditional(false);
- mention.setUncertainty(-1);
- mention.setGeneric(false);
- mention.setSubject("patient");
+ mention.setPolarity(CONST.NE_POLARITY_NEGATION_ABSENT);
+ mention.setConditional(CONST.NE_CONDITIONAL_FALSE);
+ mention.setUncertainty(CONST.NE_UNCERTAINTY_ABSENT);
+ mention.setGeneric(CONST.NE_GENERIC_FALSE);
+ mention.setSubject(CONST.ATTR_SUBJECT_PATIENT);
+
+ // set non-default values. mappings follow MITRE's conventions (see AssertionAnalysisEngine)
if(m.group(7).equals("absent")){
// negSet.add(charOffset+"-"+end);
- mention.setPolarity(-1);
+ mention.setPolarity(CONST.NE_POLARITY_NEGATION_PRESENT);
}else if(m.group(7).equals("hypothetical")){
// hypothSet.add(charOffset+"-"+end);
- mention.setGeneric(true);
+ mention.setConditional(CONST.NE_CONDITIONAL_TRUE);
}else if(m.group(7).equals("possible")){
// possSet.add(charOffset+"-"+end);
- mention.setUncertainty(1);
+ mention.setUncertainty(CONST.NE_UNCERTAINTY_PRESENT);
}else if(m.group(7).equals("associated_with_someone_else")){
// nasSet.add(charOffset+"-"+end);
- mention.setSubject("other");
- }else if(m.group(7).equals("conditional")){
-// condSet.add(charOffset+"-"+end);
- mention.setConditional(true);
-// }else if(m.group(7).equals("present")){
-// presSet.add(charOffset+"-"+end); // NOTE: There is no "present" setting, it is an inference from other things not being set.
+ mention.setSubject(CONST.ATTR_SUBJECT_FAMILY_MEMBER); // the most common non-patient case
+ }else if(m.group(7).equals("conditional")){ // no good mapping.
+//// condSet.add(charOffset+"-"+end);
+// mention.setConditional(true);
+//// }else if(m.group(7).equals("present")){
+//// presSet.add(charOffset+"-"+end); // NOTE: There is no "present" setting, it is an inference from other things not being set.
}
mention.addToIndexes();
}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?rev=1494773&r1=1494772&r2=1494773&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Wed Jun 19 20:55:38 2013
@@ -115,7 +115,7 @@ public class AssertionEvaluation extends
@Option(
name = "--train-dir",
usage = "specify the directory containing the XMI training files (for example, /NLP/Corpus/Relations/mipacq/xmi/train)",
- required = true)
+ required = false)
public String trainDirectory;
@Option(
@@ -133,7 +133,7 @@ public class AssertionEvaluation extends
@Option(
name = "--models-dir",
usage = "specify the directory where the models will be placed",
- required = true)
+ required = false)
public File modelsDirectory;
@Option(
@@ -251,12 +251,14 @@ protected static Options options = new O
// System.err.println("forcing skipping of conditional processing!!!");
// options.runConditional = false;
printOptionsForDebugging(options);
- String[] dirs = options.trainDirectory.split("[;:]");
List<File> trainFiles = new ArrayList<File>();
- for (String dir : dirs) {
- File trainDir = new File(dir);
- trainFiles.addAll(Arrays.asList(trainDir.listFiles()));
- System.out.println(trainFiles.toString());
+ if (null != options.trainDirectory) {
+ String[] dirs = options.trainDirectory.split("[;:]");
+ for (String dir : dirs) {
+ File trainDir = new File(dir);
+ trainFiles.addAll(Arrays.asList(trainDir.listFiles()));
+ // System.out.println(trainFiles.toString());
+ }
}
//File modelsDir = new File("models/modifier");
File modelsDir = options.modelsDirectory;
@@ -304,7 +306,7 @@ protected static Options options = new O
}
// run cross-validation
- else if(options.testDirectory == null || options.crossValidationFolds != null) {
+ else if(options.crossValidationFolds != null) {
// run n-fold cross-validation
List<Map<String, AnnotationStatistics>> foldStats = evaluation.crossValidation(trainFiles, options.crossValidationFolds);
//AnnotationStatistics overallStats = AnnotationStatistics.addAll(foldStats);
@@ -335,6 +337,8 @@ protected static Options options = new O
if (options.evalOnly) {
testFiles = Arrays.asList(options.evaluationOutputDirectory.listFiles());
logger.debug("evalOnly using files in directory " + evaluationOutputDirectory.getName() + " aka " + evaluationOutputDirectory.getCanonicalPath());
+ } else if (options.trainOnly){
+ testFiles = new ArrayList<File>();
} else {
testFiles = Arrays.asList(options.testDirectory.listFiles());
}
@@ -400,7 +404,7 @@ private static void printOptionsForDebug
"%n%n",
options.trainDirectory,
(options.testDirectory != null) ? options.testDirectory.getAbsolutePath() : "",
- options.modelsDirectory.getAbsolutePath(),
+ (options.modelsDirectory!=null) ? options.modelsDirectory.getAbsolutePath() : "",
options.crossValidationFolds,
options.ignorePolarity,
options.ignoreConditional,
@@ -469,7 +473,7 @@ public static void printScore(Map<String
} else {
trainDir = new File(options.trainDirectory);
}
- if (preprocessDir.getName().contains("i2b2")) {
+ if (preprocessDir.getAbsolutePath().contains("i2b2")) {
GoldEntityAndAttributeReaderPipelineForSeedCorpus.readI2B2Challenge2010(preprocessDir, trainDir);
} else {
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java?rev=1494773&r1=1494772&r2=1494773&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java Wed Jun 19 20:55:38 2013
@@ -4,43 +4,24 @@ import java.util.ArrayList;
import java.util.HashMap;
import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+import org.apache.ctakes.assertion.util.AssertionConst;
import scala.actors.threadpool.Arrays;
public class CrossValidateAttributeModels {
- private static ArrayList<String> annotationTypes = new ArrayList<String>();
- static {
- annotationTypes.add("polarity");
- annotationTypes.add("conditional");
- annotationTypes.add("uncertainty");
- annotationTypes.add("subject");
- annotationTypes.add("generic");
- annotationTypes.add("historyOf");
- }
-
- // Specify training directories for each attribute in a (semi)colon-separated list, e.g., "sharp_data/dev:sharp_data/train"
- private static HashMap<String,String> trainingDirectories = new HashMap<String,String>();
- static {
- trainingDirectories.put("polarity","sharp_data/dev");
- trainingDirectories.put("conditional","sharp_data/dev");
- trainingDirectories.put("uncertainty","sharp_data/dev");
- trainingDirectories.put("subject","sharp_data/dev");
- trainingDirectories.put("generic","sharp_data/dev");
- trainingDirectories.put("historyOf","sharp_data/dev");
- }
public static void main(String[] args) throws Exception {
- for (String attribute : annotationTypes) {
+ for (String attribute : AssertionConst.annotationTypes) {
ArrayList<String> params = new ArrayList<String>();
- params.add("--train-dir"); params.add(trainingDirectories.get(attribute));
+ params.add("--train-dir"); params.add(AssertionConst.trainingDirectories.get(attribute));
params.add("--models-dir"); params.add("sharp_data/model/eval.model");
params.add("--cross-validation"); params.add("5");
// Build up an "ignore" string
- for (String ignoreAttribute : annotationTypes) {
+ for (String ignoreAttribute : AssertionConst.annotationTypes) {
if (!ignoreAttribute.equals(attribute)) {
if (ignoreAttribute.equals("historyOf")) {
@@ -52,7 +33,7 @@ public class CrossValidateAttributeModel
}
String[] paramList = params.toArray(new String[]{});
- System.out.println(Arrays.asList(paramList).toString());
+// System.out.println(Arrays.asList(paramList).toString());
// Run the actual assertion training on just one attribute
AssertionEvaluation.main( paramList );
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java?rev=1494773&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/ReadAndPreprocessForAttributeModels.java Wed Jun 19 20:55:38 2013
@@ -0,0 +1,44 @@
+package org.apache.ctakes.assertion.train;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+import org.apache.ctakes.assertion.util.AssertionConst;
+
+import scala.actors.threadpool.Arrays;
+
+public class ReadAndPreprocessForAttributeModels {
+ /** Runs AssertionEvaluation in --preprocess-only mode, once per corpus listed in AssertionConst.preprocessRootDirectory. */
+ public static void main(String[] args) throws Exception {
+ // args is never read here; every setting comes from the maps in AssertionConst.
+ for (String source : AssertionConst.preprocessRootDirectory.keySet()) {
+ // Option list for this one source; converted to a String[] and handed to AssertionEvaluation.main below.
+ ArrayList<String> params = new ArrayList<String>();
+
+ // Always preprocess something to a main directory, usually for training
+ params.add("--train-dir"); params.add(AssertionConst.preprocessRootDirectory.get(source));
+
+ // Some corpora (SHARP) may have predetermined dev/test splits. Check AssertionConst.
+ if (AssertionConst.preprocessForDev.containsKey(source) ) {
+ params.add("--dev-dir"); params.add(AssertionConst.preprocessForDev.get(source));
+ }
+ if (AssertionConst.preprocessForTest.containsKey(source) ) {
+ params.add("--test-dir"); params.add(AssertionConst.preprocessForTest.get(source));
+ }
+
+ // The map key itself is the location to preprocess; it is passed as the --preprocess-only value (see AssertionConst)
+ params.add("--preprocess-only"); params.add(source);
+
+ String[] paramList = params.toArray(new String[]{});
+
+// System.out.println(Arrays.asList(paramList).toString());
+
+ // Run the actual assertion preprocessing on just one data source
+ AssertionEvaluation.main( paramList );
+ }
+
+
+
+ }
+}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java?rev=1494773&r1=1494772&r2=1494773&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java Wed Jun 19 20:55:38 2013
@@ -4,45 +4,26 @@ import java.util.ArrayList;
import java.util.HashMap;
import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+import org.apache.ctakes.assertion.util.AssertionConst;
import scala.actors.threadpool.Arrays;
public class TrainAttributeModels {
- private static ArrayList<String> annotationTypes = new ArrayList<String>();
- static {
- annotationTypes.add("polarity");
- annotationTypes.add("conditional");
- annotationTypes.add("uncertainty");
- annotationTypes.add("subject");
- annotationTypes.add("generic");
- annotationTypes.add("historyOf");
- }
-
- // Specify training directories for each attribute in a (semi)colon-separated list, e.g., "sharp_data/dev:sharp_data/train"
- private static HashMap<String,String> trainingDirectories = new HashMap<String,String>();
- static {
- trainingDirectories.put("polarity","sharp_data/train:i2b2_data/train");
- trainingDirectories.put("conditional","sharp_data/train");
- trainingDirectories.put("uncertainty","sharp_data/train");
- trainingDirectories.put("subject","sharp_data/train");
- trainingDirectories.put("generic","sharp_data/train");
- trainingDirectories.put("historyOf","sharp_data/train");
- }
public static void main(String[] args) throws Exception {
- for (String attribute : annotationTypes) {
+ for (String attribute : AssertionConst.annotationTypes) {
ArrayList<String> params = new ArrayList<String>();
- params.add("--train-dir"); params.add(trainingDirectories.get(attribute));
+ params.add("--train-dir"); params.add(AssertionConst.trainingDirectories.get(attribute));
// params.add("--test-dir"); params.add("sharp_data/dev");
params.add("--models-dir"); params.add("sharp_data/model/eval.model");
// params.add("--evaluation-output-dir"); params.add("sharp_data/output");
params.add("--train-only");
// Build up an "ignore" string
- for (String ignoreAttribute : annotationTypes) {
+ for (String ignoreAttribute : AssertionConst.annotationTypes) {
if (!ignoreAttribute.equals(attribute)) {
if (ignoreAttribute.equals("historyOf")) {
@@ -54,7 +35,7 @@ public class TrainAttributeModels {
}
String[] paramList = params.toArray(new String[]{});
- System.out.println(Arrays.asList(paramList).toString());
+// System.out.println(Arrays.asList(paramList).toString());
// Run the actual assertion training on just one attribute
AssertionEvaluation.main( paramList );
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java?rev=1494773&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java Wed Jun 19 20:55:38 2013
@@ -0,0 +1,70 @@
+package org.apache.ctakes.assertion.util;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class AssertionConst {
+ // Shared settings for the attribute-model drivers: corpus locations, per-attribute training directories, and the attribute list.
+ /*** CHANGE THESE ***/
+
+ // Locally-stored corpora (absolute, machine-specific paths)
+
+ // expects subdirectories: "Mayo/UMLS_CEM/*batch*/Knowtator*" "Seattle Group Health/UMLS_CEM/*batch*/Knowtator*"
+ public static final String SHARP_SEED_CORPUS = "/Users/m081914/work/data/sharp/Seed Corpus";
+ // expects subdirectories: ast, txt
+ public static final String I2B2_2010_CORPUS = "/Users/m081914/work/data/i2b2Challenge2010/Data/i2b2Challenge2010AllTrain";
+ // expects subdirectories: ast, txt
+ public static final String I2B2_2010_TEST_CORPUS = "/Users/m081914/work/data/i2b2Challenge2010/Data/Test/reports";
+
+ // Specify training directories for each attribute in a (semi)colon-separated list, e.g., "sharp_data/dev:sharp_data/train"
+ public static HashMap<String,String> trainingDirectories = new HashMap<String,String>();
+ static {
+ trainingDirectories.put("polarity","sharp_data/train:i2b2_data/train");
+// trainingDirectories.put("polarity","sharp_data/train");
+// trainingDirectories.put("polarity","i2b2_data/train");
+ trainingDirectories.put("conditional","sharp_data/train");
+ trainingDirectories.put("uncertainty","sharp_data/train");
+ trainingDirectories.put("subject","sharp_data/train");
+ trainingDirectories.put("generic","sharp_data/train");
+ trainingDirectories.put("historyOf","sharp_data/train");
+ }
+
+ // If you don't want to train/cross-validate everything, comment these out
+ public static ArrayList<String> annotationTypes = new ArrayList<String>();
+ static {
+ annotationTypes.add("polarity");
+ annotationTypes.add("conditional");
+ annotationTypes.add("uncertainty");
+ annotationTypes.add("subject");
+ annotationTypes.add("generic");
+ annotationTypes.add("historyOf");
+ }
+
+
+ /*** DON'T CHANGE THESE ***/
+ // In the maps below, each key is a corpus location built from the constants above and each value is a relative output directory.
+ // Specify input and output data locations for preprocessing. Results will be used for model training
+ public static HashMap<String,String> preprocessRootDirectory = new HashMap<String,String>();
+ static {
+ preprocessRootDirectory.put(SHARP_SEED_CORPUS+"/Mayo/UMLS_CEM","sharp_data/train");
+ preprocessRootDirectory.put(SHARP_SEED_CORPUS+"/Seattle Group Health/UMLS_CEM","sharp_data/train");
+ preprocessRootDirectory.put(I2B2_2010_CORPUS,"i2b2_data/train");
+ preprocessRootDirectory.put(I2B2_2010_TEST_CORPUS,"i2b2_data/test");
+ }
+ // NOTE(review): HashMap iteration order is unspecified, so code looping over these keys sees the sources in no fixed order.
+ // Specify input and output data locations for preprocessing. Results will be used for model test
+ public static HashMap<String,String> preprocessForTest = new HashMap<String,String>();
+ static {
+ preprocessForTest.put(SHARP_SEED_CORPUS+"/Mayo/UMLS_CEM","sharp_data/test");
+ preprocessForTest.put(SHARP_SEED_CORPUS+"/Seattle Group Health/UMLS_CEM","sharp_data/test");
+ }
+ // Only the two SHARP sources have test and dev entries; the i2b2 sources appear in preprocessRootDirectory alone.
+ // Specify input and output data locations for preprocessing. Results will be used for model dev
+ public static HashMap<String,String> preprocessForDev = new HashMap<String,String>();
+ static {
+ preprocessForDev.put(SHARP_SEED_CORPUS+"/Mayo/UMLS_CEM","sharp_data/dev");
+ preprocessForDev.put(SHARP_SEED_CORPUS+"/Seattle Group Health/UMLS_CEM","sharp_data/dev");
+ }
+
+
+}