You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by st...@apache.org on 2012/12/20 00:14:56 UTC

svn commit: r1424215 - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/ ae/feature/ eval/

Author: stevenbethard
Date: Wed Dec 19 23:14:55 2012
New Revision: 1424215

URL: http://svn.apache.org/viewvc?rev=1424215&view=rev
Log:
Fixes and refactors a bunch of really awful code that was recently introduced.

Modified:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Wed Dec 19 23:14:55 2012
@@ -1,4 +1,5 @@
 package org.apache.ctakes.temporal.ae;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
@@ -18,20 +19,15 @@ package org.apache.ctakes.temporal.ae;
  * under the License.
  */
 
-
 import java.io.File;
 import java.io.IOException;
 import java.net.URI;
-//import java.net.URL;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
-//import java.util.logging.Logger;
 
-//import org.apache.ctakes.temporal.ae.feature.CoveredTextToValuesExtractor;
 import org.apache.ctakes.temporal.ae.feature.PhraseExtractor;
 import org.apache.ctakes.temporal.ae.feature.SRLExtractor;
 import org.apache.ctakes.temporal.ae.feature.SurfaceFormFeatureExtractor;
@@ -47,10 +43,8 @@ import org.apache.uima.analysis_engine.A
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.classifier.CleartkAnnotator;
-//import org.cleartk.classifier.DataWriter;
 import org.cleartk.classifier.Feature;
 import org.cleartk.classifier.Instance;
-//import org.cleartk.classifier.feature.transform.InstanceDataWriter;
 import org.cleartk.classifier.chunking.BIOChunking;
 import org.cleartk.classifier.feature.extractor.CleartkExtractor;
 import org.cleartk.classifier.feature.extractor.CleartkExtractor.Following;
@@ -64,67 +58,74 @@ import org.cleartk.classifier.feature.ex
 import org.cleartk.classifier.jar.DefaultDataWriterFactory;
 import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
 import org.cleartk.classifier.jar.GenericJarClassifierFactory;
+import org.uimafit.descriptor.ConfigurationParameter;
 import org.uimafit.factory.AnalysisEngineFactory;
 import org.uimafit.factory.ConfigurationParameterFactory;
-import org.uimafit.descriptor.ConfigurationParameter;
 import org.uimafit.util.JCasUtil;
 
-//import com.google.common.base.Charsets;
 import com.google.common.base.Predicate;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
-//import com.google.common.io.LineProcessor;
-//import com.google.common.io.Resources;
 
 public class EventAnnotator extends CleartkAnnotator<String> {
 
   public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE = "ProbabilityOfKeepingANegativeExample";
 
   @ConfigurationParameter(
-			name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
-			mandatory = false,
-			description = "probability that a negative example should be retained for training")
-  protected Float probabilityOfKeepingANegativeExample = 0.8f;
-  
-  public static final String PARAM_FEATURE_TRIM_ORNOT = "WhetherToDoFeatureSelection";
+      name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+      mandatory = false,
+      description = "probability that a negative example should be retained for training")
+  protected Float probabilityOfKeepingANegativeExample = 1f;
+
+  public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection";
 
   @ConfigurationParameter(
-			name = PARAM_FEATURE_TRIM_ORNOT,
-			mandatory = false,
-			description = "set whether feature selection is used or not")
-  public static Float featureTrim = 0f;
-  
+      name = PARAM_FEATURE_SELECTION_THRESHOLD,
+      mandatory = false,
+      description = "the Chi-squared threshold at which features should be removed")
+  protected Float featureSelectionThreshold = 0f;
+
+  public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
+
+  @ConfigurationParameter(
+      mandatory = false,
+      name = PARAM_FEATURE_SELECTION_URI,
+      description = "provides a URI where the feature selection data will be written")
+  protected URI featureSelectionURI;
+
   public static AnalysisEngineDescription createDataWriterDescription(
-      String dataWriterName,
-      File outputDirectory, float downratio, float featureSelect) throws ResourceInitializationException {
+      Class<?> dataWriter,
+      File outputDirectory,
+      float downratio,
+      float featureSelect) throws ResourceInitializationException {
     return AnalysisEngineFactory.createPrimitiveDescription(
         EventAnnotator.class,
         CleartkAnnotator.PARAM_IS_TRAINING,
         true,
         DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
-        dataWriterName,
+        dataWriter,
         DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
         outputDirectory,
         EventAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
         downratio,
-        EventAnnotator.PARAM_FEATURE_TRIM_ORNOT,
+        EventAnnotator.PARAM_FEATURE_SELECTION_THRESHOLD,
         featureSelect);
   }
 
   public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
       throws ResourceInitializationException {
-	 AnalysisEngineDescription fsEventAnnotator =AnalysisEngineFactory.createPrimitiveDescription(
+    AnalysisEngineDescription fsEventAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
         EventAnnotator.class,
         CleartkAnnotator.PARAM_IS_TRAINING,
         false,
         GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
         new File(modelDirectory, "model.jar"));
-	 ConfigurationParameterFactory.addConfigurationParameter(
-		fsEventAnnotator,	
-		EventAnnotator.PARAM_NB_FS_URI,
-		EventAnnotator.createNbFSURI(modelDirectory) );
-	 
-     return(fsEventAnnotator);
+    ConfigurationParameterFactory.addConfigurationParameter(
+        fsEventAnnotator,
+        EventAnnotator.PARAM_FEATURE_SELECTION_URI,
+        EventAnnotator.createFeatureSelectionURI(modelDirectory));
+
+    return (fsEventAnnotator);
   }
 
   protected List<SimpleFeatureExtractor> tokenFeatureExtractors;
@@ -134,26 +135,16 @@ public class EventAnnotator extends Clea
   private BIOChunking<BaseToken, EntityMention> entityChunking;
 
   private BIOChunking<BaseToken, EventMention> eventChunking;
-  
-  public static final String PARAM_NB_FS_URI = ConfigurationParameterFactory.createConfigurationParameterName(
-		      EventAnnotator.class,
-		      "neighborFsUri");
-
-	  @ConfigurationParameter(
-		  mandatory = false,
-		  description = "provides a URI where the neighbor annotation's feature selection data will be written")
-	  protected URI neighborFsUri;
-		    
-  public static final String FS_NEIGHBOR_EXTRACTOR_KEY = "SelectNeighborFeatures";
-
-  private Chi2NeighborFSExtractor<String> chi2NeighborFsExtractor;
-  
-  
-  public static URI createNbFSURI(File outputDirectoryName) {
-	    File f = new File(outputDirectoryName, FS_NEIGHBOR_EXTRACTOR_KEY + "_Chi2_extractor.dat");
-	    return f.toURI();
-	  }
-  //*****feature selection related parameters
+
+  public static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
+
+  private Chi2NeighborFSExtractor<String> featureSelectionExtractor;
+
+  public static URI createFeatureSelectionURI(File outputDirectoryName) {
+    return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+  }
+
+  // *****feature selection related parameters
 
   @Override
   public void initialize(UimaContext context) throws ResourceInitializationException {
@@ -167,74 +158,47 @@ public class EventAnnotator extends Clea
     this.eventChunking = new BIOChunking<BaseToken, EventMention>(
         BaseToken.class,
         EventMention.class);
-      
-    //configure FS extractor:
-    if (featureTrim > 0){//if feature selection
-        CombinedExtractor forneighbors    = new CombinedExtractor(
-    			new CoveredTextExtractor(),
-    			new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
-    			new TypePathExtractor(BaseToken.class, "partOfSpeech"),
-    			new SurfaceFormFeatureExtractor(),
-    	      	new PhraseExtractor(),
-    	      	new SRLExtractor());
-        
-        try {
-    		this.chi2NeighborFsExtractor = initNbFSExtractor(forneighbors);
-    	} catch (IOException e) {
-    		e.printStackTrace();
-    	}
-    }else{//if no feature selection
-        // add features: word, stem, pos and more
-        this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
-        // try {
-        this.tokenFeatureExtractors.addAll(Arrays.asList(
-        		new CoveredTextExtractor(),
-//            	new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
-//            	new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
-//            	new SurfaceFormFeatureExtractor(),
-//            	new PhraseExtractor(),
-            	new SRLExtractor()));
-        		// new CoveredTextToValuesExtractor("ACF", StringToDoublesProcessor.parse("/word_freq.lst")),
-        		// new CoveredTextToValuesExtractor("PCA", StringToDoublesProcessor.parse("/word_pca.lst")),
-        		// new CoveredTextToValuesExtractor("TimPCA", StringToDoublesProcessor.parse("/tim_word_pca.txt"))));
-
-        //add window of features before and after
-        CombinedExtractor subExtractor = new CombinedExtractor(
-        	new CoveredTextExtractor(),
-        	new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
-        	new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
-        	new SurfaceFormFeatureExtractor(),
-        	new SRLExtractor());
-
-        this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
-        this.contextFeatureExtractors.add(new CleartkExtractor(
-        	BaseToken.class,
-        	subExtractor,
-        	new Preceding(3),
-        	new Following(3)));
-    }
 
+    CombinedExtractor subExtractor = new CombinedExtractor(
+        new CoveredTextExtractor(),
+        new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+        new TypePathExtractor(BaseToken.class, "partOfSpeech"),
+        new SurfaceFormFeatureExtractor(),
+        new PhraseExtractor(),
+        new SRLExtractor());
+
+    if (featureSelectionThreshold > 0) {
+      this.featureSelectionExtractor = new Chi2NeighborFSExtractor<String>(
+          EventAnnotator.FEATURE_SELECTION_NAME,
+          BaseToken.class,
+          subExtractor,
+          this.featureSelectionThreshold,
+          new Preceding(4),
+          new Following(4));
 
+      if (this.featureSelectionURI != null) {
+        try {
+          this.featureSelectionExtractor.load(this.featureSelectionURI);
+        } catch (IOException e) {
+          throw new ResourceInitializationException(e);
+        }
+      }
+    } else {
+      this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
+      this.tokenFeatureExtractors.add(subExtractor);
+      this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
+      this.contextFeatureExtractors.add(new CleartkExtractor(
+          BaseToken.class,
+          subExtractor,
+          new Preceding(3),
+          new Following(3)));
+    }
   }
 
-
-private Chi2NeighborFSExtractor<String> initNbFSExtractor(
-		CombinedExtractor subextractor) throws IOException{
-
-	Chi2NeighborFSExtractor<String> chi2NbFSExtractor = new  Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, BaseToken.class, subextractor, featureTrim, new Preceding(4),
-	    	new Following(4)); //the 3rd last parameter is used to control chi2 threshold, the last two are used to control window size
-	
-	if (this.neighborFsUri != null) {
-		chi2NbFSExtractor.load(this.neighborFsUri);
-	    }
-	return chi2NbFSExtractor;
-}
-
-
   @Override
   public void process(JCas jCas) throws AnalysisEngineProcessException {
-	  
-	Random rand = new Random();
+
+    Random rand = new Random();
     // classify tokens within each sentence
     for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
       List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
@@ -270,54 +234,62 @@ private Chi2NeighborFSExtractor<String> 
       int tokenIndex = -1;
       int window = 2;
       int nPreviousClassifications = 2;
-      
+
       for (BaseToken token : tokens) {
         ++tokenIndex;
 
         List<Feature> features = new ArrayList<Feature>();
-        
-        if (featureTrim >0 ){//if feature selection
-        	features.addAll(this.chi2NeighborFsExtractor.extract(jCas, token)); //base features
-        	features.addAll(this.chi2NeighborFsExtractor.extractWithin(jCas, token, sentence)); //neighbor features
-        	features.addAll(this.chi2NeighborFsExtractor.extract(entityTypeIDs, entityTagsByType,tokenIndex, window)); // features from surrounding entities
-        	features.addAll(this.chi2NeighborFsExtractor.extract(nPreviousClassifications, tokenIndex, outcomes)); //features from previous classifications
-        }else{ //if no feature selection
-        	// features from token attributes
-            for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
-              features.addAll(extractor.extract(jCas, token));
-            }
-            // features from surrounding tokens
-            for (CleartkExtractor extractor : this.contextFeatureExtractors) {
-              features.addAll(extractor.extractWithin(jCas, token, sentence));
-            }
-            // features from surrounding entities
-            for (int typeID : entityTypeIDs) {
-              List<String> tokenEntityTags = entityTagsByType.get(typeID);
-              int begin = Math.max(tokenIndex - window, 0);
-              int end = Math.min(tokenIndex + window, tokenEntityTags.size());
-              for (int i = begin; i < end; ++i) {
-                String name = String.format("EntityTag_%d_%d", typeID, i - begin);
-                features.add(new Feature(name, tokenEntityTags.get(i)));
-              }
-            }
-            // features from previous classifications
-            for (int i = nPreviousClassifications; i > 0; --i) {
-              int index = tokenIndex - i;
-              String previousOutcome = index < 0 ? "O" : outcomes.get(index);
-              features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+
+        if (featureSelectionThreshold > 0) {// if feature selection
+          features.addAll(this.featureSelectionExtractor.extract(jCas, token)); // base features
+          features.addAll(this.featureSelectionExtractor.extractWithin(jCas, token, sentence)); // neighbor
+          // features
+          features.addAll(this.featureSelectionExtractor.extract(
+              entityTypeIDs,
+              entityTagsByType,
+              tokenIndex,
+              window)); // features from surrounding entities
+          features.addAll(this.featureSelectionExtractor.extract(
+              nPreviousClassifications,
+              tokenIndex,
+              outcomes)); // features from previous classifications
+        } else { // if no feature selection
+          // features from token attributes
+          for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
+            features.addAll(extractor.extract(jCas, token));
+          }
+          // features from surrounding tokens
+          for (CleartkExtractor extractor : this.contextFeatureExtractors) {
+            features.addAll(extractor.extractWithin(jCas, token, sentence));
+          }
+          // features from surrounding entities
+          for (int typeID : entityTypeIDs) {
+            List<String> tokenEntityTags = entityTagsByType.get(typeID);
+            int begin = Math.max(tokenIndex - window, 0);
+            int end = Math.min(tokenIndex + window, tokenEntityTags.size());
+            for (int i = begin; i < end; ++i) {
+              String name = String.format("EntityTag_%d_%d", typeID, i - begin);
+              features.add(new Feature(name, tokenEntityTags.get(i)));
             }
+          }
+          // features from previous classifications
+          for (int i = nPreviousClassifications; i > 0; --i) {
+            int index = tokenIndex - i;
+            String previousOutcome = index < 0 ? "O" : outcomes.get(index);
+            features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+          }
         }
-        
+
         // if training, write to data file
         if (this.isTraining()) {
-            String outcome = outcomes.get(tokenIndex);
-            if(outcome.equals("O")){ //if it is an "O". downsample it
-          	  if (rand.nextDouble()<=probabilityOfKeepingANegativeExample)
-          		  this.dataWriter.write(new Instance<String>(outcome, features));
-            }else {
-          	  this.dataWriter.write(new Instance<String>(outcome, features));
-            }
+          String outcome = outcomes.get(tokenIndex);
+          if (outcome.equals("O")) { // if it is an "O". downsample it
+            if (rand.nextDouble() <= probabilityOfKeepingANegativeExample)
+              this.dataWriter.write(new Instance<String>(outcome, features));
+          } else {
+            this.dataWriter.write(new Instance<String>(outcome, features));
           }
+        }
 
         // if predicting, add prediction to outcomes
         else {
@@ -334,47 +306,14 @@ private Chi2NeighborFSExtractor<String> 
 
   private static Predicate<EntityMention> hasEntityType(final int typeID) {
     return new Predicate<EntityMention>() {
+      @Override
       public boolean apply(EntityMention mention) {
         return mention.getTypeID() == typeID;
       }
     };
   }
 
-//  private static class StringToDoublesProcessor implements LineProcessor<Map<String, double[]>> {
-//    private Logger logger = Logger.getLogger(this.getClass().getName());
-//
-//    private Map<String, double[]> result = new HashMap<String, double[]>();
-//
-//    private int length = -1;
-//
-//    @Override
-//    public Map<String, double[]> getResult() {
-//      return this.result;
-//    }
-//
-//    @Override
-//    public boolean processLine(String line) throws IOException {
-//      String[] parts = line.trim().split(",");
-//      String key = parts[0];
-//      int partsOffset = 0;
-//      if (this.length == -1) {
-//        this.length = parts.length;
-//      } else if (parts.length != this.length) {
-//        String message = "expected %d parts, found %d, skipping line '%s'";
-//        this.logger.warning(String.format(message, this.length, parts.length, line));
-//        return true;
-//      }
-//      double[] values = new double[parts.length - 1];
-//      for (int i = 0; i < values.length; ++i) {
-//        values[i] = Double.parseDouble(parts[i + 1 + partsOffset]);
-//      }
-//      this.result.put(key, values);
-//      return true;
-//    }
-//  }
-
-
-public Chi2NeighborFSExtractor<String> getChi2NbSubExtractor() {
-	return this.chi2NeighborFsExtractor;
-}
+  public Chi2NeighborFSExtractor<String> getChi2NbSubExtractor() {
+    return this.featureSelectionExtractor;
+  }
 }

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java Wed Dec 19 23:14:55 2012
@@ -18,9 +18,14 @@
  */
 package org.apache.ctakes.temporal.ae.feature;
 
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.logging.Logger;
 
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
@@ -28,6 +33,9 @@ import org.cleartk.classifier.Feature;
 import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
 import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
 
+import com.google.common.io.Files;
+import com.google.common.io.LineProcessor;
+
 public class CoveredTextToValuesExtractor implements SimpleFeatureExtractor {
 
   private String name;
@@ -35,6 +43,43 @@ public class CoveredTextToValuesExtracto
   private Map<String, double[]> textDoublesMap;
 
   private double[] meanValues;
+  
+  public static Map<String, double[]> parseTextDoublesMap(File file, Charset charset) throws IOException {
+    return Files.readLines(file, charset, new StringToDoublesProcessor());
+  }
+
+  static class StringToDoublesProcessor implements LineProcessor<Map<String, double[]>> {
+    private Logger logger = Logger.getLogger(this.getClass().getName());
+
+    private Map<String, double[]> result = new HashMap<String, double[]>();
+
+    private int length = -1;
+
+    @Override
+    public Map<String, double[]> getResult() {
+      return this.result;
+    }
+
+    @Override
+    public boolean processLine(String line) throws IOException {
+      String[] parts = line.trim().split(",");
+      String key = parts[0];
+      int partsOffset = 0;
+      if (this.length == -1) {
+        this.length = parts.length;
+      } else if (parts.length != this.length) {
+        String message = "expected %d parts, found %d, skipping line '%s'";
+        this.logger.warning(String.format(message, this.length, parts.length, line));
+        return true;
+      }
+      double[] values = new double[parts.length - 1];
+      for (int i = 0; i < values.length; ++i) {
+        values[i] = Double.parseDouble(parts[i + 1 + partsOffset]);
+      }
+      this.result.put(key, values);
+      return true;
+    }
+  }
 
   public CoveredTextToValuesExtractor(String name, Map<String, double[]> textDoublesMap) {
     super();

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java Wed Dec 19 23:14:55 2012
@@ -20,7 +20,6 @@ package org.apache.ctakes.temporal.eval;
 
 import java.io.File;
 import java.io.IOException;
-import java.net.URI;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
@@ -32,17 +31,12 @@ import java.util.logging.Level;
 import java.util.logging.LogRecord;
 import java.util.logging.Logger;
 
-import org.apache.ctakes.temporal.ae.EventAnnotator;
-import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.collection.CollectionReader;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
-import org.cleartk.classifier.Instance;
-import org.cleartk.classifier.feature.transform.InstanceStream;
-import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
 import org.cleartk.eval.AnnotationStatistics;
 import org.cleartk.util.ViewURIUtil;
 import org.uimafit.factory.AggregateBuilder;
@@ -92,25 +86,6 @@ public abstract class EvaluationOfAnnota
     aggregateBuilder.add(this.getPreprocessorTrainDescription());
     aggregateBuilder.add(this.getDataWriterDescription(directory));
     SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
-    
-    if( EventAnnotator.featureTrim > 0 ){
-    	//Extracting features and writing instances
-        Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
-        // Collect MinMax stats for feature normalization
-        URI chi2NbFsURI = EventAnnotator.createNbFSURI(directory);
-        Chi2NeighborFSExtractor<String> chi2NbFsExtractor = new Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, EventAnnotator.featureTrim);
-        chi2NbFsExtractor.train(instances);
-        chi2NbFsExtractor.save(chi2NbFsURI);
-        //now write in the libsvm format
-        this.logger.info("Write out model training data");
-        LIBSVMStringOutcomeDataWriter dataWriter = new LIBSVMStringOutcomeDataWriter(directory);
-        for (Instance<String> instance : instances) {
-          instance = chi2NbFsExtractor.transform(instance);
-          dataWriter.write(instance);
-        }
-        dataWriter.finish();
-    }
-    
     this.trainAndPackage(directory);
   }
 

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Wed Dec 19 23:14:55 2012
@@ -19,29 +19,46 @@
 package org.apache.ctakes.temporal.eval;
 
 import java.io.File;
+import java.net.URI;
 import java.util.Collection;
 import java.util.EnumSet;
 import java.util.List;
 import java.util.logging.Level;
 
 import org.apache.ctakes.temporal.ae.EventAnnotator;
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.collection.CollectionReader;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.TOP;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.Instance;
 import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.feature.transform.InstanceStream;
 import org.cleartk.classifier.jar.JarClassifierBuilder;
 import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
 import org.cleartk.eval.AnnotationStatistics;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.pipeline.SimplePipeline;
 import org.uimafit.util.JCasUtil;
 
 import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
 
 public class EvaluationOfEventSpans extends EvaluationOfAnnotationSpans_ImplBase {
 
+  static interface Options extends Evaluation_ImplBase.Options {
+
+    @Option(longName = "downratio", defaultValue = "1")
+    public float getProbabilityOfKeepingANegativeExample();
+
+    @Option(longName = "featureSelectionThreshold", defaultValue = "0")
+    public float getFeatureSelectionThreshold();
+  }
+
   public static void main(String[] args) throws Exception {
     Options options = CliFactory.parseArguments(Options.class, args);
     EvaluationOfEventSpans evaluation = new EvaluationOfEventSpans(
@@ -49,57 +66,78 @@ public class EvaluationOfEventSpans exte
         options.getRawTextDirectory(),
         options.getKnowtatorXMLDirectory(),
         options.getPatients().getList(),
-        options.getDownSampleRatio(),
-    	options.getFeatureSelect()); //control apply feature selection or not
+        options.getProbabilityOfKeepingANegativeExample(),
+        options.getFeatureSelectionThreshold());
     evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-event-errors.log"));
-    List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(4);
+    List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(2);
     for (AnnotationStatistics<String> stats : foldStats) {
       System.err.println(stats);
     }
     System.err.println("OVERALL");
     System.err.println(AnnotationStatistics.addAll(foldStats));
   }
-  
-  private float downratio;
-  private float featureTrim;
+
+  private float probabilityOfKeepingANegativeExample;
+
+  private float featureSelectionThreshold;
 
   public EvaluationOfEventSpans(
       File baseDirectory,
       File rawTextDirectory,
       File knowtatorXMLDirectory,
       List<Integer> patientSets,
-      float downratio, float featureSelect) {
-    super(
-        baseDirectory,
-        rawTextDirectory,
-        knowtatorXMLDirectory,
-        patientSets,
-        EnumSet.of(AnnotatorType.PART_OF_SPEECH_TAGS,
+      float probabilityOfKeepingANegativeExample,
+      float featureSelectionThreshold) {
+    super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, patientSets, EnumSet.of(
+        AnnotatorType.PART_OF_SPEECH_TAGS));
         //AnnotatorType.UMLS_NAMED_ENTITIES,
-//        AnnotatorType.LEXICAL_VARIANTS,
-        AnnotatorType.DEPENDENCIES,
-        AnnotatorType.SEMANTIC_ROLES));
-    this.downratio = downratio;
-    this.featureTrim = featureSelect;
+        //AnnotatorType.LEXICAL_VARIANTS,
+        //AnnotatorType.DEPENDENCIES,
+        //AnnotatorType.SEMANTIC_ROLES));
+    this.probabilityOfKeepingANegativeExample = probabilityOfKeepingANegativeExample;
+    this.featureSelectionThreshold = featureSelectionThreshold;
   }
 
   @Override
   protected AnalysisEngineDescription getDataWriterDescription(File directory)
       throws ResourceInitializationException {
-	if(this.featureTrim > 0){
-		return EventAnnotator.createDataWriterDescription(
-		    	InstanceDataWriter.class.getName(),
-		        directory,
-		        this.downratio,
-		        this.featureTrim);
-	}
-	return EventAnnotator.createDataWriterDescription(
-	        LIBSVMStringOutcomeDataWriter.class.getName(),
-	        directory,
-	        this.downratio,
-	        this.featureTrim);
-	
-    
+    Class<?> dataWriterClass = this.featureSelectionThreshold > 0f
+        ? InstanceDataWriter.class
+        : LIBSVMStringOutcomeDataWriter.class;
+    return EventAnnotator.createDataWriterDescription(
+        dataWriterClass,
+        directory,
+        this.probabilityOfKeepingANegativeExample,
+        this.featureSelectionThreshold);
+  }
+
+  @Override
+  protected void train(CollectionReader collectionReader, File directory) throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(this.getPreprocessorTrainDescription());
+    aggregateBuilder.add(this.getDataWriterDescription(directory));
+    SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
+
+    if (this.featureSelectionThreshold > 0) {
+      // Extracting features and writing instances
+      Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
+      // Collect MinMax stats for feature normalization
+      URI chi2NbFsURI = EventAnnotator.createFeatureSelectionURI(directory);
+      Chi2NeighborFSExtractor<String> chi2NbFsExtractor = new Chi2NeighborFSExtractor<String>(
+          EventAnnotator.FEATURE_SELECTION_NAME,
+          this.featureSelectionThreshold);
+      chi2NbFsExtractor.train(instances);
+      chi2NbFsExtractor.save(chi2NbFsURI);
+      // now write in the libsvm format
+      LIBSVMStringOutcomeDataWriter dataWriter = new LIBSVMStringOutcomeDataWriter(directory);
+      for (Instance<String> instance : instances) {
+        instance = chi2NbFsExtractor.transform(instance);
+        dataWriter.write(instance);
+      }
+      dataWriter.finish();
+    }
+
+    this.trainAndPackage(directory);
   }
 
   @Override

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Wed Dec 19 23:14:55 2012
@@ -84,12 +84,6 @@ public abstract class Evaluation_ImplBas
 
     @Option(longName = "patients")
     public CommandLine.IntegerRanges getPatients();
-    
-    @Option(longName = "downratio", defaultValue="1")
-    public float getDownSampleRatio();
-
-    @Option(longName = "featureSelect", defaultValue="0")
-    public float getFeatureSelect(); //get feature selection cut off threshold is it is > 0. apply no FS if featureSelect == 0 
   }
 
   protected File rawTextDirectory;