You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/02/07 21:38:00 UTC

svn commit: r1565801 - /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java

Author: tmill
Date: Fri Feb  7 20:37:59 2014
New Revision: 1565801

URL: http://svn.apache.org/r1565801
Log:
CTAKES-82: Major changes to assertion eval to make reading other corpora easier.

Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?rev=1565801&r1=1565800&r2=1565801&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Fri Feb  7 20:37:59 2014
@@ -20,7 +20,6 @@ package org.apache.ctakes.assertion.eval
 
 import java.io.BufferedWriter;
 import java.io.File;
-import java.io.FileNotFoundException;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.lang.reflect.Constructor;
@@ -29,7 +28,6 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
-import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -39,6 +37,7 @@ import java.util.TreeMap;
 import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.assertion.medfacts.cleartk.AlternateCuePhraseAnnotator;
 import org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine.FEATURE_CONFIG;
 import org.apache.ctakes.assertion.medfacts.cleartk.AssertionComponents;
 import org.apache.ctakes.assertion.medfacts.cleartk.ConditionalCleartkAnalysisEngine;
 import org.apache.ctakes.assertion.medfacts.cleartk.GenericCleartkAnalysisEngine;
@@ -80,20 +79,21 @@ import org.apache.uima.resource.Resource
 import org.apache.uima.resource.ResourceProcessException;
 import org.apache.uima.resource.metadata.TypeSystemDescription;
 import org.apache.uima.util.CasCopier;
+import org.apache.uima.util.FileUtils;
 import org.cleartk.classifier.DataWriter;
-import org.cleartk.classifier.jar.DefaultDataWriterFactory;
 import org.cleartk.classifier.Instance;
 import org.cleartk.classifier.feature.transform.InstanceDataWriter;
 import org.cleartk.classifier.feature.transform.InstanceStream;
+import org.cleartk.classifier.jar.DefaultDataWriterFactory;
 import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
 import org.cleartk.classifier.jar.GenericJarClassifierFactory;
 import org.cleartk.classifier.jar.JarClassifierBuilder;
-import org.cleartk.classifier.liblinear.LIBLINEARStringOutcomeDataWriter;
+import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
 import org.cleartk.eval.Evaluation_ImplBase;
-import org.cleartk.util.Options_ImplBase;
+import org.cleartk.ml.libsvm.tk.TKLIBSVMStringOutcomeDataWriter;
+import org.kohsuke.args4j.CmdLineParser;
 import org.kohsuke.args4j.Option;
 import org.kohsuke.args4j.spi.BooleanOptionHandler;
-import org.mitre.medfacts.uima.ZoneAnnotator;
 import org.uimafit.component.JCasAnnotator_ImplBase;
 import org.uimafit.component.NoOpAnnotator;
 import org.uimafit.component.xwriter.XWriter;
@@ -120,7 +120,9 @@ private static Logger logger = Logger.ge
 
   private static final String YTEX_NEGATION_DESCRIPTOR = "ytex.uima.NegexAnnotator";
 
-  public static class Options extends Options_ImplBase {
+  enum Corpus {SHARP_SEED, SHARP_STRATIFIED, MIPACQ, I2B2, NEGEX}
+
+  public static class Options {
     @Option(
         name = "--train-dir",
         usage = "specify the directory containing the XMI training files (for example, /NLP/Corpus/Relations/mipacq/xmi/train)",
@@ -252,6 +254,30 @@ private static Logger logger = Logger.ge
     public Float featureSelectionThreshold = null;
 
     @Option(
+        name = "--kernel-params",
+        usage = "Set of parameters to pass to kernel (libsvm)",
+        required = false)
+    public String kernelParams = null;
+    
+    @Option(
+        name = "--use-tmp",
+        usage = "Whether to put trained models into a temp directory (e.g., for a grid search)",
+        required = false)
+    public boolean useTmp = false;
+    
+    @Option(
+        name = "--corpus",
+        usage = "What corpus to read for pre-processing",
+        required = false)
+    public Corpus corpus = Corpus.SHARP_SEED;
+    
+    @Option(
+        name = "--feats",
+        usage = "What feature configuration to use",
+        required = false)
+    public FEATURE_CONFIG featConfig = FEATURE_CONFIG.ALL_SYN;
+
+    @Option(
     		name = "--feda",
     		usage = "Domain adaptation -- for each semicolon-separated directory in train-dir, creates a domain-specific feature space",
     		required = false)
@@ -288,7 +314,9 @@ private static Logger logger = Logger.ge
     
     //Options options = new Options();
 	  resetOptions();
-	  options.parseOptions(args);
+	  CmdLineParser parser = new CmdLineParser(options);
+	  parser.parseArgument(args);
+//	  options.parseOptions(args);
 
 	  if (useEvaluationLogFile && evaluationLogFileOut == null) {
 		  evaluationLogFile = new File(evaluationLogFilePath);
@@ -321,6 +349,15 @@ private static Logger logger = Logger.ge
     }
     //File modelsDir = new File("models/modifier");
     File modelsDir = options.modelsDirectory;
+    if(options.useTmp){
+      File tempModelDir = new File(options.modelsDirectory, "temp");
+      tempModelDir.mkdirs();
+      File curModelDir = File.createTempFile("assertion", null, tempModelDir);
+      curModelDir.delete();
+      curModelDir.mkdir();
+      modelsDir = curModelDir;
+    }
+    
     File evaluationOutputDirectory = options.evaluationOutputDirectory;
 
     // determine the type of classifier to be trained
@@ -347,16 +384,46 @@ private static Logger logger = Logger.ge
     if (!options.ignoreGeneric) { annotationTypes.add("generic"); }
     if (!options.ignoreHistory) { annotationTypes.add("historyOf"); }
     
+    String[] kernelParams = null;
+    if(options.kernelParams != null){
+      kernelParams = options.kernelParams.split("\\s+");
+    }else{
+      kernelParams = new String[]{"-t", "0", "-c", "1"};
+    }
+    Class<? extends DataWriter<String>> dw = null;
+    if(options.featConfig == FEATURE_CONFIG.STK || options.featConfig == FEATURE_CONFIG.PTK){ 
+        dw = TKLIBSVMStringOutcomeDataWriter.class;
+    }else{
+        dw = LIBSVMStringOutcomeDataWriter.class;
+    }
+    
     AssertionEvaluation evaluation = new AssertionEvaluation(
         modelsDir,
         evaluationOutputDirectory,
         annotationTypes,
         annotatorClass,
-        LIBLINEARStringOutcomeDataWriter.class,
-        "-c",
-        "1"
+        dw,
+        kernelParams
+//        "-t",
+//        "0",
+//       TKLIBSVMStringOutcomeDataWriter.class,
+//        "-c",
+//        "1"
 //        "-t",
+//        "5",
+//        "-C",
+//        "+",
+//        "-L",
+//        "0.4",
+//        "-N",
+//        "3",
+//        "-S",
 //        "0"
+       
+//        "-w0",
+//        "100.0",
+//        "-w1",
+//        "1.0"
 //        "100",
 //        "2"
         );
@@ -421,7 +488,9 @@ private static Logger logger = Logger.ge
     	  AssertionEvaluation.printScore(stats,  modelsDir!=null? modelsDir.getAbsolutePath() : "no_model");
       }
     }
-    
+    if(options.useTmp && modelsDir != null){
+      FileUtils.deleteRecursive(modelsDir);
+    }
     System.out.println("Finished assertion module at " + new Date());
     
   }
@@ -570,21 +639,23 @@ public static void printScore(Map<String
 	  File preprocessedDir = null;
 	  if (options.trainDirectory.split("[;]").length>1) {
 		  throw new IOException("Assertion preprocess wants to write to one train directory, but you've supplied multiple: " + options.trainDirectory);
-	  } else {
-		  preprocessedDir = new File(options.trainDirectory);
 	  }
-	  if (rawDir.getAbsolutePath().contains("i2b2")) {
+		preprocessedDir = new File(options.trainDirectory);
+	  
+	  if(options.corpus == Corpus.I2B2){
 		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readI2B2Challenge2010(rawDir, preprocessedDir);
-		  
-	  } else if (rawDir.getAbsolutePath().contains("mipacq")) {
+	  }else if(options.corpus == Corpus.MIPACQ){
 		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readMiPACQ(rawDir, preprocessedDir, options.testDirectory, options.devDirectory);
-		  
-	  } else if (rawDir.getAbsolutePath().contains("negex")) {
+	  }else if(options.corpus == Corpus.NEGEX){
 		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readNegexTestSet(rawDir, preprocessedDir);
-		  
-	  } else{
-		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readSharpUmlsCem(
+	  }else if(options.corpus == Corpus.SHARP_STRATIFIED){
+	    GoldEntityAndAttributeReaderPipelineForSeedCorpus.readSharpStratifiedUmls(
+	        rawDir, preprocessedDir, options.testDirectory, options.devDirectory);
+	  } else if(options.corpus == Corpus.SHARP_SEED){
+		  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readSharpSeedUmls(
 				  rawDir, preprocessedDir, options.testDirectory, options.devDirectory);
+	  } else{
+	    throw new ResourceInitializationException("No corpus type specified!", new Object[]{rawDir});
 	  }
   }
   
@@ -614,22 +685,22 @@ public static void printScore(Map<String
     AnalysisEngineDescription assertionAttributeClearerAnnotator = AnalysisEngineFactory.createPrimitiveDescription(ReferenceAnnotationsSystemAssertionClearer.class);
     builder.add(assertionAttributeClearerAnnotator);
     
-    String generalSectionRegexFileUri =
-        "org/mitre/medfacts/zoner/section_regex.xml";
-    AnalysisEngineDescription zonerAnnotator =
-        AnalysisEngineFactory.createPrimitiveDescription(ZoneAnnotator.class,
-            ZoneAnnotator.PARAM_SECTION_REGEX_FILE_URI,
-            generalSectionRegexFileUri
-            );
+//    String generalSectionRegexFileUri =
+//        "org/mitre/medfacts/zoner/section_regex.xml";
+//    AnalysisEngineDescription zonerAnnotator =
+//        AnalysisEngineFactory.createPrimitiveDescription(ZoneAnnotator.class,
+//            ZoneAnnotator.PARAM_SECTION_REGEX_FILE_URI,
+//            generalSectionRegexFileUri
+//            );
 //    builder.add(zonerAnnotator);
-
-    String mayoSectionRegexFileUri =
-        "org/mitre/medfacts/uima/mayo_sections.xml";
-    AnalysisEngineDescription mayoZonerAnnotator =
-        AnalysisEngineFactory.createPrimitiveDescription(ZoneAnnotator.class,
-            ZoneAnnotator.PARAM_SECTION_REGEX_FILE_URI,
-            mayoSectionRegexFileUri
-            );
+//
+//    String mayoSectionRegexFileUri =
+//        "org/mitre/medfacts/uima/mayo_sections.xml";
+//    AnalysisEngineDescription mayoZonerAnnotator =
+//        AnalysisEngineFactory.createPrimitiveDescription(ZoneAnnotator.class,
+//            ZoneAnnotator.PARAM_SECTION_REGEX_FILE_URI,
+//            mayoSectionRegexFileUri
+//            );
 //    builder.add(mayoZonerAnnotator);
   
 //    URL assertionCuePhraseLookupAnnotatorDescriptorUrl1 = this.getClass().getClassLoader().getResource("org/apache/ctakes/dictionary/lookup/AssertionCuePhraseDictionaryLookupAnnotator.xml");
@@ -683,7 +754,9 @@ public static void printScore(Map<String
     				AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
     				PolarityCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "polarity")),
     				AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
-    				featureSelectionThreshold
+    				featureSelectionThreshold,
+    				AssertionCleartkAnalysisEngine.PARAM_FEATURE_CONFIG,
+    				options.featConfig
     				);
     	}
 		builder.add(polarityAnnotator);
@@ -726,7 +799,9 @@ public static void printScore(Map<String
 			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
 			UncertaintyCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "uncertainty")),
 			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
-			featureSelectionThreshold
+			featureSelectionThreshold,
+      AssertionCleartkAnalysisEngine.PARAM_FEATURE_CONFIG,
+      options.featConfig
 	        );
 	    builder.add(uncertaintyAnnotator);
     }
@@ -1092,14 +1167,14 @@ public static void printScore(Map<String
 	  }
 
 	  // train models based on instances
-	  JarClassifierBuilder.trainAndPackage(directory, "-c", "0.05");
+	  JarClassifierBuilder.trainAndPackage(directory, arguments);
   }
   
   protected Class<? extends DataWriter> getDataWriterClass()
       throws ResourceInitializationException {
     return (options.featureSelectionThreshold!=null)
         ? InstanceDataWriter.class
-        : LIBLINEARStringOutcomeDataWriter.class;
+        : this.dataWriterClass;
   }
   
   private static boolean DEBUG = false;
@@ -1467,22 +1542,22 @@ private void addCleartkAttributeAnnotato
 //	builder.add(cuePhraseLookupAnnotator);
     builder.add(AnalysisEngineFactory.createPrimitiveDescription(AlternateCuePhraseAnnotator.class, new Object[]{}));
 
-	String generalSectionRegexFileUri =
-		"org/mitre/medfacts/zoner/section_regex.xml";
-	AnalysisEngineDescription zonerAnnotator =
-		AnalysisEngineFactory.createPrimitiveDescription(ZoneAnnotator.class,
-				ZoneAnnotator.PARAM_SECTION_REGEX_FILE_URI,
-				generalSectionRegexFileUri
-		);
-//	builder.add(zonerAnnotator);
-
-	String mayoSectionRegexFileUri =
-		"org/mitre/medfacts/uima/mayo_sections.xml";
-	AnalysisEngineDescription mayoZonerAnnotator =
-		AnalysisEngineFactory.createPrimitiveDescription(ZoneAnnotator.class,
-				ZoneAnnotator.PARAM_SECTION_REGEX_FILE_URI,
-				mayoSectionRegexFileUri
-		);
+//	String generalSectionRegexFileUri =
+//		"org/mitre/medfacts/zoner/section_regex.xml";
+//	AnalysisEngineDescription zonerAnnotator =
+//		AnalysisEngineFactory.createPrimitiveDescription(ZoneAnnotator.class,
+//				ZoneAnnotator.PARAM_SECTION_REGEX_FILE_URI,
+//				generalSectionRegexFileUri
+//		);
+////	builder.add(zonerAnnotator);
+//
+//	String mayoSectionRegexFileUri =
+//		"org/mitre/medfacts/uima/mayo_sections.xml";
+//	AnalysisEngineDescription mayoZonerAnnotator =
+//		AnalysisEngineFactory.createPrimitiveDescription(ZoneAnnotator.class,
+//				ZoneAnnotator.PARAM_SECTION_REGEX_FILE_URI,
+//				mayoSectionRegexFileUri
+//		);
 //	builder.add(mayoZonerAnnotator);
 
 	// Add the ClearTk or the ytex negation (polarity) classifier
@@ -1510,7 +1585,9 @@ private void addCleartkAttributeAnnotato
     				AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
     				AssertionEvaluation.GOLD_VIEW_NAME,
     				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
-    				new File(new File(directory, "polarity"), "model.jar").getPath()
+    				new File(new File(directory, "polarity"), "model.jar").getPath(),
+            PolarityCleartkAnalysisEngine.PARAM_FEATURE_CONFIG,
+            options.featConfig
     				);
     		builder.add(polarityAnnotator);
     	}