You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by sw...@apache.org on 2013/05/09 21:07:27 UTC

svn commit: r1480740 - in /ctakes/trunk/ctakes-assertion: resources/launch/ctakes_assertion_evaluation_on_output.launch src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java

Author: swu
Date: Thu May  9 19:07:27 2013
New Revision: 1480740

URL: http://svn.apache.org/r1480740
Log:
ctakes-assertion: make it possible to evaluate whether the manual instance gathering matches the gold standard

Added:
    ctakes/trunk/ctakes-assertion/resources/launch/ctakes_assertion_evaluation_on_output.launch
Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java

Added: ctakes/trunk/ctakes-assertion/resources/launch/ctakes_assertion_evaluation_on_output.launch
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/resources/launch/ctakes_assertion_evaluation_on_output.launch?rev=1480740&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/resources/launch/ctakes_assertion_evaluation_on_output.launch (added)
+++ ctakes/trunk/ctakes-assertion/resources/launch/ctakes_assertion_evaluation_on_output.launch Thu May  9 19:07:27 2013
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication">
+<stringAttribute key="bad_container_name" value="/ctakes-assertion/resour"/>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
+<listEntry value="/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java"/>
+</listAttribute>
+<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
+<listEntry value="1"/>
+</listAttribute>
+<stringAttribute key="org.eclipse.debug.ui.ATTR_CAPTURE_IN_FILE" value="/tmp/assertion.log"/>
+<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/>
+<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.ctakes.assertion.eval.AssertionEvaluation"/>
+<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="--train-dir sharp_data/train --test-dir sharp_data/dev --models-dir sharp_data/model/eval.model --evaluation-output-dir sharp_data/output_instancegathering --eval-only"/>
+<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="ctakes-assertion"/>
+<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
+<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Xmx1600M"/>
+</launchConfiguration>

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?rev=1480740&r1=1480739&r2=1480740&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Thu May  9 19:07:27 2013
@@ -20,9 +20,6 @@ package org.apache.ctakes.assertion.eval
 
 import java.io.File;
 import java.io.IOException;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
-import java.net.URI;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -33,6 +30,31 @@ import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 
+import org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.AssertionComponents;
+import org.apache.ctakes.assertion.medfacts.cleartk.ConditionalCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.GenericCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.HistoryCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.SubjectCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.pipelines.GoldEntityAndAttributeReaderPipelineForSeedCorpus;
+import org.apache.ctakes.core.ae.DocumentIdPrinterAnalysisEngine;
+import org.apache.ctakes.core.util.CtakesFileNamer;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.NumToken;
+import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
+import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Modifier;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
 import org.apache.log4j.Logger;
 import org.apache.uima.UIMAException;
 import org.apache.uima.analysis_engine.AnalysisEngine;
@@ -46,38 +68,22 @@ import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.resource.ResourceProcessException;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
 import org.apache.uima.util.CasCopier;
-import org.apache.uima.util.Level;
-//import org.chboston.cnlp.ctakes.relationextractor.ae.RelationExtractorAnnotator;
-//import org.chboston.cnlp.ctakes.relationextractor.eval.RelationExtractorEvaluation;
-//import org.chboston.cnlp.ctakes.relationextractor.ae.ModifierExtractorAnnotator;
 import org.cleartk.classifier.CleartkAnnotator;
 import org.cleartk.classifier.DataWriterFactory;
 import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
 import org.cleartk.classifier.jar.GenericJarClassifierFactory;
 import org.cleartk.classifier.jar.JarClassifierBuilder;
 import org.cleartk.classifier.opennlp.DefaultMaxentDataWriterFactory;
-import org.cleartk.classifier.opennlp.MaxentStringOutcomeDataWriter;
 import org.cleartk.eval.AnnotationStatistics;
 import org.cleartk.eval.Evaluation_ImplBase;
 import org.cleartk.util.Options_ImplBase;
 import org.kohsuke.args4j.Option;
 import org.kohsuke.args4j.spi.BooleanOptionHandler;
 import org.mitre.medfacts.uima.ZoneAnnotator;
-import org.apache.ctakes.assertion.medfacts.AssertionAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.AssertionComponents;
-import org.apache.ctakes.assertion.medfacts.cleartk.ConditionalCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.GenericCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.HistoryCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.SubjectCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
-import org.apache.ctakes.assertion.pipelines.GoldEntityAndAttributeReaderPipelineForSeedCorpus;
-import org.apache.ctakes.core.ae.DocumentIdPrinterAnalysisEngine;
-import org.apache.ctakes.core.util.CtakesFileNamer;
-import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.component.NoOpAnnotator;
 import org.uimafit.component.xwriter.XWriter;
 import org.uimafit.factory.AggregateBuilder;
 import org.uimafit.factory.AnalysisEngineFactory;
@@ -86,31 +92,13 @@ import org.uimafit.factory.Configuration
 import org.uimafit.factory.TypeSystemDescriptionFactory;
 import org.uimafit.pipeline.JCasIterable;
 import org.uimafit.pipeline.SimplePipeline;
-import org.uimafit.testing.util.HideOutput;
 import org.uimafit.util.JCasUtil;
 
-import com.google.common.base.Function;
 import com.google.common.base.Objects;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 
-import org.apache.ctakes.typesystem.type.constants.CONST;
-import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
-import org.apache.ctakes.typesystem.type.relation.RelationArgument;
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
-import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
-import org.apache.ctakes.typesystem.type.syntax.NumToken;
-import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
-import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
-import org.apache.ctakes.typesystem.type.syntax.WordToken;
-import org.apache.ctakes.typesystem.type.textsem.EntityMention;
-import org.apache.ctakes.typesystem.type.textsem.EventMention;
-import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
-import org.apache.ctakes.typesystem.type.textsem.Modifier;
-import org.apache.ctakes.typesystem.type.textspan.Sentence;
-
 public class AssertionEvaluation extends Evaluation_ImplBase<File, Map<String, AnnotationStatistics>> {
   
   private static Logger logger = Logger.getLogger(AssertionEvaluation.class); 
@@ -219,6 +207,12 @@ public class AssertionEvaluation extends
     		usage = "Flag to have test method print out error context for misclassified examples",
     		required = false)
     public boolean printErrors = false;
+
+    @Option(
+    		name = "--eval-only",
+    		usage = "Evaluate a CASes (supply the directory as an argument) with both system and gold in them.",
+    		required = false)
+    public boolean evalOnly;
   }
   
   protected ArrayList<String> annotationTypes;
@@ -318,9 +312,14 @@ protected static Options options = new O
     // run on test set
     else {
       // train on the entire training set and evaluate on the test set
-      List<File> testFiles = Arrays.asList(options.testDirectory.listFiles());
+      List<File> testFiles;
+      if (options.evalOnly) {
+    	  testFiles = Arrays.asList(options.evaluationOutputDirectory.listFiles());
+      } else {
+    	  testFiles = Arrays.asList(options.testDirectory.listFiles());
+      }
       
-      if (!options.testOnly) {
+      if (!options.testOnly && !options.evalOnly) {
     	  CollectionReader trainCollectionReader = evaluation.getCollectionReader(trainFiles);
     	  evaluation.train(trainCollectionReader, modelsDir);
       }
@@ -608,8 +607,19 @@ public static void printScore(Map<String
     	addCleartkAttributeAnnotatorsToAggregate(directory, builder);
     }
 
-    if (evaluationOutputDirectory != null)
-    {
+    if (options.evalOnly && evaluationOutputDirectory != null) {
+    	// short circuit any other stuff in the pipeline
+    	builder = new AggregateBuilder();
+    	
+    	// uimafit find available type systems on classpath
+    	TypeSystemDescription typeSystemDescription = TypeSystemDescriptionFactory.createTypeSystemDescription();
+    	
+        AnalysisEngineDescription noOp =
+    		AnalysisEngineFactory.createPrimitiveDescription(
+	            NoOpAnnotator.class,
+	            typeSystemDescription);
+    	builder.add(noOp);
+    } else if (evaluationOutputDirectory!=null)  {
         AnalysisEngineDescription xwriter =
     		AnalysisEngineFactory.createPrimitiveDescription(
 	            XWriter.class,
@@ -666,6 +676,7 @@ public static void printScore(Map<String
     	map.put("historyOf", historyStats);
     }
 
+    // run on existing output that has both system (or instance gathering) and gold
     for (JCas jCas : new JCasIterable(collectionReader, aggregate)) {
       JCas goldView;
       try {
@@ -784,15 +795,15 @@ public static void printScore(Map<String
 				  // used for multi-class case:
 				  System.out.println("Incorrectly labeled as " + systemLabel + " when the example was " + goldLabel + ": " + formatError(jCas, goldAnnotation));
 			  }else if(systemLabel.equals(trueCategory)){
-				  System.out.println("False positive: " + formatError(jCas, systemAnnotation));
+				  System.out.println(classifierType+" FP: " + formatError(jCas, systemAnnotation));
 			  }else{
-				  System.out.println("False negative: " + formatError(jCas, goldAnnotation));
+				  System.out.println(classifierType+" FN: " + formatError(jCas, goldAnnotation));
 			  }
 		  }else{
 			  if(systemLabel.equals(trueCategory)){
-				  System.out.println("True positive: " + formatError(jCas, systemAnnotation));
+				  System.out.println(classifierType+" TP: " + formatError(jCas, systemAnnotation));
 			  }else{
-				  System.out.println("True negative: " + formatError(jCas, systemAnnotation));
+				  System.out.println(classifierType+" TN: " + formatError(jCas, systemAnnotation));
 			  }
 		  }
 	  }
@@ -817,10 +828,11 @@ public static void printScore(Map<String
 	  if(context.size() > 0){
 		  Sentence sent = context.get(0);
 		  buff.append(sent.getCoveredText());
-		  int offset = mention.getBegin() - sent.getBegin();
-		  buff.insert(offset, "***");
+		  long offset = mention.getBegin() - sent.getBegin();
+		  if (offset>=Integer.MAX_VALUE || offset<=Integer.MIN_VALUE) { offset=0; } // for spanless annots
+		  buff.insert((int)offset, "***");
 		  offset += (mention.getEnd()-mention.getBegin() + 3);
-		  buff.insert(offset, "***");
+		  buff.insert((int)offset, "***");
 	  }
 	  return buff.toString();
   }
@@ -901,13 +913,13 @@ private void addExternalAttributeAnnotat
 	);
 	builder.add(oldConversionAnnotator);
 
-//	AnalysisEngineDescription oldSubjectAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription("desc/SubjectAttributeAnalysisEngine"); 
-//	ConfigurationParameterFactory.addConfigurationParameters(
-//			oldSubjectAnnotator,
-//			AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-//			AssertionEvalBasedOnModifier.GOLD_VIEW_NAME
-//	);
-//	builder.add(oldSubjectAnnotator);
+	AnalysisEngineDescription oldSubjectAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription("desc/SubjectAttributeAnalysisEngine"); 
+	ConfigurationParameterFactory.addConfigurationParameters(
+			oldSubjectAnnotator,
+			AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
+			AssertionEvaluation.GOLD_VIEW_NAME
+	);
+	builder.add(oldSubjectAnnotator);
 
 	AnalysisEngineDescription oldGenericAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription("desc/GenericAttributeAnalysisEngine"); 
 	ConfigurationParameterFactory.addConfigurationParameters(