You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by sw...@apache.org on 2013/07/15 21:51:22 UTC

svn commit: r1503438 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion: attributes/features/ attributes/features/selection/ eval/ medfacts/cleartk/ train/

Author: swu
Date: Mon Jul 15 19:51:22 2013
New Revision: 1503438

URL: http://svn.apache.org/r1503438
Log:
chi2 feature selection and lots of associated stuff

Added:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java   (with props)
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java   (with props)
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java   (with props)
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java   (with props)
Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java Mon Jul 15 19:51:22 2013
@@ -51,7 +51,8 @@ public class GenericFeaturesExtractor im
 		// Pull in general dependency-based features -- externalize to another extractor?
 	    ConllDependencyNode node = DependencyUtility.getNominalHeadNode(jCas, arg);
 	    if (node!= null) {
-	    	features.add(new Feature("DEPENDENCY_HEAD", node));
+	    	features.add(new Feature("DEPENDENCY_HEAD", node.getCoveredText()));
+	    	features.add(new Feature("DEPENDENCY_HEAD_deprel", node.getDeprel()));
 		}
 	    
 	    HashMap<String, Boolean> featsMap = GenericAttributeClassifier.extract(jCas, arg);

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java Mon Jul 15 19:51:22 2013
@@ -51,7 +51,8 @@ public class SubjectFeaturesExtractor im
 		// Pull in general dependency-based features -- externalize to another extractor?
 	    ConllDependencyNode node = DependencyUtility.getNominalHeadNode(jCas, arg);
 	    if (node!= null) {
-	    	features.add(new Feature("DEPENDENCY_HEAD", node));
+	    	features.add(new Feature("DEPENDENCY_HEAD", node.getCoveredText()));
+	    	features.add(new Feature("DEPENDENCY_HEAD_deprel", node.getDeprel()));
 		}
 	    
 	    HashMap<String, Boolean> featsMap = SubjectAttributeClassifier.extract(jCas, arg);

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java?rev=1503438&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java Mon Jul 15 19:51:22 2013
@@ -0,0 +1,197 @@
+package org.apache.ctakes.assertion.attributes.features.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
+import com.google.common.collect.Table;
+
+/**
+ * 
+ * Selects features via Chi-squared statistics between the features extracted from its sub-extractor
+ * and the outcome values they are paired with in classification instances.
+ * 
+ * @author Chen Lin
+ * 
+ */
+public class Chi2FeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {
+
+  /**
+   * Helper class for aggregating and computing Chi2 statistics
+   */
+  private static class Chi2Scorer<OUTCOME_T> implements Function<String, Double> {
+    protected Multiset<OUTCOME_T> classCounts;
+
+    protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+    public Chi2Scorer() {
+      this.classCounts = HashMultiset.<OUTCOME_T> create();
+      this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+    }
+
+    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+      Integer count = this.featValueClassCount.get(featureName, outcome);
+      if (count == null) {
+        count = 0;
+      }
+      this.featValueClassCount.put(featureName, outcome, count + occurrences);
+      this.classCounts.add(outcome, occurrences);
+    }
+    
+    public Double apply(String featureName) {
+      return this.score(featureName);
+    }
+
+    public double score(String featureName) {
+      // notation: an index of 0 means false, 1 means true
+      // Contingency Table:
+      //      | class1  | class2  | class3  | sum
+      // posi |         |         |         | posiFeatCount
+      // nega |         |         |         | negaFeatCount
+      //      | outcnt1 | outcnt2 | outcnt3 | n
+
+      int numOfClass = this.classCounts.elementSet().size();
+      int[] posiOutcomeCounts = new int[numOfClass];
+      int[] outcomeCounts = new int[numOfClass];
+      int classId = 0;
+      int posiFeatCount = 0;
+      for (OUTCOME_T clas : this.classCounts.elementSet()) {
+        posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)
+            ? this.featValueClassCount.get(featureName, clas)
+            : 0;
+        posiFeatCount += posiOutcomeCounts[classId];
+        outcomeCounts[classId] = this.classCounts.count(clas);
+        classId++;
+      }
+
+      int n = this.classCounts.size();
+      int negaFeatCount = n - posiFeatCount;
+
+      double chi2val = 0.0;
+
+      if (posiFeatCount == 0 || posiFeatCount == n) { // all instances have same value on this
+                                                      // feature, degree of freedom = 0
+        return chi2val;
+      }
+
+      boolean yates = true;
+      for (int lbl = 0; lbl < numOfClass; lbl++) {
+        // for positive part of feature:
+        double expected = (outcomeCounts[lbl] / (double) n) * (posiFeatCount);
+        if (expected > 0) {
+          double diff = Math.abs(posiOutcomeCounts[lbl] - expected);
+          if (yates) { // apply Yates' correction
+            diff -= 0.5;
+          }
+          if (diff > 0)
+            chi2val += Math.pow(diff, 2) / expected;
+        }
+
+        // for negative part of feature:
+        expected = (outcomeCounts[lbl] / (double) n) * (negaFeatCount);
+        double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
+        if (expected > 0) {
+          double diff = Math.abs(observ - expected);
+          if (yates) { // apply Yates' correction
+            diff -= 0.5;
+          }
+          if (diff > 0)
+            chi2val += Math.pow(diff, 2) / expected;
+        }
+      }
+
+      return chi2val;
+    }
+  }
+
+  private double chi2Threshold;
+
+  private Chi2Scorer<OUTCOME_T> chi2Function;
+
+  public Chi2FeatureSelection(String name) {
+    this(name, 0.0);
+  }
+
+  public Chi2FeatureSelection(String name, double threshold) {
+    super(name);
+    this.chi2Threshold = threshold;
+  }
+
+  @Override
+  public boolean apply(Feature feature) {
+    return this.selectedFeatureNames.contains(this.getFeatureName(feature));
+  }
+
+  @Override
+  public void train(Iterable<Instance<OUTCOME_T>> instances) {
+    // aggregate statistics for all features
+    this.chi2Function = new Chi2Scorer<OUTCOME_T>();
+    for (Instance<OUTCOME_T> instance : instances) {
+      OUTCOME_T outcome = instance.getOutcome();
+      for (Feature feature : instance.getFeatures()) {
+        if (this.isTransformable(feature)) {
+          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+            this.chi2Function.update(this.getFeatureName(untransformedFeature), outcome, 1);
+          }
+        }
+      }
+    }
+    // keep only large chi2 valued features
+    this.selectedFeatureNames = Sets.newHashSet();
+    for (String featureName : this.chi2Function.featValueClassCount.rowKeySet()) {
+      if (this.chi2Function.score(featureName) > this.chi2Threshold) {
+        this.selectedFeatureNames.add(featureName);
+      }
+    }
+
+    this.isTrained = true;
+  }
+
+  @Override
+  public void save(URI uri) throws IOException {
+    if (!this.isTrained) {
+      throw new IllegalStateException("Cannot save before training");
+    }
+    File out = new File(uri);
+    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+
+    Ordering<String> ordering = Ordering.natural().onResultOf(this.chi2Function).reverse();
+    for (String feature : ordering.immutableSortedCopy(this.selectedFeatureNames)) {
+      writer.append(String.format("%s\t%f\n", feature, this.chi2Function.score(feature)));
+    }
+
+    writer.close();
+  }
+
+  @Override
+  public void load(URI uri) throws IOException {
+    this.selectedFeatureNames = Sets.newLinkedHashSet();
+    File in = new File(uri);
+    BufferedReader reader = new BufferedReader(new FileReader(in));
+
+    // The lines are <feature-name>\t<feature-score>
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      String[] featureValuePair = line.split("\t");
+      this.selectedFeatureNames.add(featureValuePair[0]);
+    }
+
+    reader.close();
+    this.isTrained = true;
+
+  }
+}

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java?rev=1503438&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java Mon Jul 15 19:51:22 2013
@@ -0,0 +1,64 @@
+package org.apache.ctakes.assertion.attributes.features.selection;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.cleartk.classifier.feature.transform.TrainableExtractor_ImplBase;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.Lists;
+
+public abstract class FeatureSelection<OUTCOME_T> extends
+    TrainableExtractor_ImplBase<OUTCOME_T> implements Predicate<Feature> {
+
+  protected boolean isTrained;
+  
+  protected Set<String> selectedFeatureNames;
+
+  public FeatureSelection(String name) {
+    super(name);
+    this.isTrained = false;
+  }
+
+  @Override
+  public boolean apply(Feature feature) {
+    return this.selectedFeatureNames.contains(this.getFeatureName(feature));
+  }
+
+  @Override
+  public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
+    List<Feature> features = new ArrayList<Feature>();
+    for (Feature feature : instance.getFeatures()) {
+      if (this.isTransformable(feature)) {
+        // Filter down to selected features
+        features.addAll(Collections2.filter(((TransformableFeature) feature).getFeatures(), this));
+      } else {
+        // Pass non-relevant features through w/o filtering
+        features.add(feature);
+      }
+    }
+    return new Instance<OUTCOME_T>(instance.getOutcome(), features);
+  }
+
+  public List<Feature> transform(List<Feature> features) {
+    List<Feature> results = Lists.newArrayList();
+    if (this.isTrained) {
+      results.addAll(Collections2.filter(features, this));
+    } else {
+      results.add(new TransformableFeature(this.name, features));
+    }
+    return results;
+  }
+
+  protected String getFeatureName(Feature feature) {
+    String featureName = feature.getName();
+    Object featureValue = feature.getValue();
+    return featureValue instanceof Number ? featureName : featureName + ":" + featureValue;
+  }
+
+}

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java?rev=1503438&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java Mon Jul 15 19:51:22 2013
@@ -0,0 +1,267 @@
+package org.apache.ctakes.assertion.attributes.features.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
+import com.google.common.collect.Table;
+
+/**
+ * <br>
+ * Copyright (c) 2007-2012, Regents of the University of Colorado <br>
+ * All rights reserved.
+ * <p>
+ * 
+ * Selects features via mutual information statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ * 
+ * @author Lee Becker
+ * 
+ */
+public class MutualInformationFeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {
+
+  /**
+   * Specifies how scores for each outcome should be combined/aggregated into a single score
+   */
+  public static enum CombineScoreMethod implements Function<Map<?, Double>, Double> {
+    AVERAGE {
+      public Double apply(Map<?, Double> input) {
+        Collection<Double> scores = input.values();
+        int size = scores.size();
+        double total = 0;
+        for (Double score : scores) {
+          total += score;
+        }
+        return total / size;
+      }
+    },
+    MAX {
+      @Override
+      public Double apply(Map<?, Double> input) {
+        return Ordering.natural().max(input.values());
+      }
+    }
+  }
+
+  /**
+   * Helper class for aggregating and computing mutual information statistics
+   */
+  public static class MutualInformationStats<OUTCOME_T> {
+    protected Multiset<OUTCOME_T> classCounts;
+
+    protected Table<String, OUTCOME_T, Integer> classConditionalCounts;
+
+    protected double smoothingCount;
+
+    public MutualInformationStats(double smoothingCount) {
+      this.classCounts = HashMultiset.<OUTCOME_T> create();
+      this.classConditionalCounts = HashBasedTable.<String, OUTCOME_T, Integer> create();
+      this.smoothingCount += smoothingCount;
+    }
+
+    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+      Integer count = this.classConditionalCounts.get(featureName, outcome);
+      if (count == null) {
+        count = 0;
+      }
+      this.classConditionalCounts.put(featureName, outcome, count + occurrences);
+      this.classCounts.add(outcome, occurrences);
+    }
+
+    public double mutualInformation(String featureName, OUTCOME_T outcome) {
+      // notation: an index of 0 means false, 1 means true
+      int[] featureCounts = new int[2];
+      int[] outcomeCounts = new int[2];
+      int[][] featureOutcomeCounts = new int[2][2];
+
+      int n = this.classCounts.size();
+      featureCounts[1] = sum(this.classConditionalCounts.row(featureName).values());
+      featureCounts[0] = n - featureCounts[1];
+      outcomeCounts[1] = this.classCounts.count(outcome);
+      outcomeCounts[0] = n - outcomeCounts[1];
+
+      featureOutcomeCounts[1][1] = this.classConditionalCounts.contains(featureName, outcome)
+          ? this.classConditionalCounts.get(featureName, outcome)
+          : 0;
+      featureOutcomeCounts[1][0] = featureCounts[1] - featureOutcomeCounts[1][1];
+      featureOutcomeCounts[0][1] = outcomeCounts[1] - featureOutcomeCounts[1][1];
+      featureOutcomeCounts[0][0] = n - featureCounts[1] - outcomeCounts[1]
+          + featureOutcomeCounts[1][1];
+
+      double information = 0.0;
+      for (int nFeature = 0; nFeature <= 1; nFeature++) {
+        for (int nOutcome = 0; nOutcome <= 1; nOutcome++) {
+          featureOutcomeCounts[nFeature][nOutcome] += smoothingCount;
+          information += (double) featureOutcomeCounts[nFeature][nOutcome]
+              / (double) n
+              * Math.log(((double) n * featureOutcomeCounts[nFeature][nOutcome])
+                  / ((double) featureCounts[nFeature] * outcomeCounts[nOutcome]));
+        }
+      }
+
+      return information;
+    }
+
+    private static int sum(Collection<Integer> values) {
+      int total = 0;
+      for (int v : values) {
+        total += v;
+      }
+      return total;
+    }
+
+    public void save(URI outputURI) throws IOException {
+      File out = new File(outputURI);
+      BufferedWriter writer = null;
+      writer = new BufferedWriter(new FileWriter(out));
+
+      // Write out header
+      writer.append("Mutual Information Data\n");
+      writer.append("Feature\t");
+      writer.append(Joiner.on("\t").join(this.classConditionalCounts.columnKeySet()));
+      writer.append("\n");
+
+      // Write out Mutual Information data
+      for (String featureName : this.classConditionalCounts.rowKeySet()) {
+        writer.append(featureName);
+        for (OUTCOME_T outcome : this.classConditionalCounts.columnKeySet()) {
+          writer.append("\t");
+          writer.append(String.format("%f", this.mutualInformation(featureName, outcome)));
+        }
+        writer.append("\n");
+      }
+      writer.append("\n");
+      writer.append(this.classConditionalCounts.toString());
+      writer.close();
+    }
+
+    public Function<String, Double> getScoreFunction(final CombineScoreMethod combineScoreMethod) {
+      return new Function<String, Double>() {
+
+        @Override
+        public Double apply(String featureName) {
+          Set<OUTCOME_T> outcomes = classConditionalCounts.columnKeySet();
+          Map<OUTCOME_T, Double> featureOutcomeMI = Maps.newHashMap();
+          for (OUTCOME_T outcome : outcomes) {
+            featureOutcomeMI.put(outcome, mutualInformation(featureName, outcome));
+          }
+          return combineScoreMethod.apply(featureOutcomeMI);
+        }
+      };
+    }
+  }
+
+  private MutualInformationStats<OUTCOME_T> mutualInfoStats;
+
+  private int numFeatures;
+
+  private CombineScoreMethod combineScoreMethod;
+
+  private double smoothingCount;
+
+  public MutualInformationFeatureSelection(String name) {
+    this(name, CombineScoreMethod.MAX, 1.0, 10);
+  }
+
+  public MutualInformationFeatureSelection(String name, int numFeatures) {
+    this(name, CombineScoreMethod.MAX, 1.0, numFeatures);
+  }
+
+  public MutualInformationFeatureSelection(
+      String name,
+      CombineScoreMethod combineScoreMethod,
+      double smoothingCount,
+      int numFeatures) {
+    super(name);
+    this.combineScoreMethod = combineScoreMethod;
+    this.smoothingCount = smoothingCount;
+    this.numFeatures = numFeatures;
+  }
+
+  @Override
+  public void train(Iterable<Instance<OUTCOME_T>> instances) {
+    // aggregate statistics for all features and classes
+    this.mutualInfoStats = new MutualInformationStats<OUTCOME_T>(this.smoothingCount);
+    for (Instance<OUTCOME_T> instance : instances) {
+      OUTCOME_T outcome = instance.getOutcome();
+      for (Feature feature : instance.getFeatures()) {
+        if (this.isTransformable(feature)) {
+          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+            mutualInfoStats.update(this.getFeatureName(untransformedFeature), outcome, 1);
+          }
+        }
+      }
+    }
+
+    // sort features by mutual information score
+    Set<String> featureNames = mutualInfoStats.classConditionalCounts.rowKeySet();
+    Function<String, Double> scoreFunction = this.mutualInfoStats.getScoreFunction(this.combineScoreMethod);
+    Ordering<String> ordering = Ordering.natural().onResultOf(scoreFunction).reverse();
+
+    // keep only the top N features
+    this.selectedFeatureNames = Sets.newLinkedHashSet(ordering.immutableSortedCopy(featureNames).subList(
+        0,
+        this.numFeatures));
+    this.isTrained = true;
+  }
+
+  @Override
+  public void save(URI uri) throws IOException {
+    if (!this.isTrained) {
+      throw new IOException("MutualInformationFeatureExtractor: Cannot save before training.");
+    }
+    File out = new File(uri);
+    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+    writer.append("CombineScoreType\t");
+    writer.append(this.combineScoreMethod.toString());
+    writer.append('\n');
+
+    for (String featureName : this.selectedFeatureNames) {
+      writer.append(featureName);
+      writer.append('\n');
+    }
+
+    writer.close();
+  }
+
+  @Override
+  public void load(URI uri) throws IOException {
+    this.selectedFeatureNames = Sets.newLinkedHashSet();
+    File in = new File(uri);
+    BufferedReader reader = new BufferedReader(new FileReader(in));
+
+    // First line specifies the combine utility type
+    this.combineScoreMethod = CombineScoreMethod.valueOf(reader.readLine().split("\t")[1]);
+
+    // The rest of the lines are the selected feature names, one per line
+    String line = null;
+    int n = 0;
+    while ((line = reader.readLine()) != null && n < this.numFeatures) {
+      String featureName = line.trim();
+      this.selectedFeatureNames.add(featureName);
+      n++;
+    }
+
+    reader.close();
+    this.isTrained = true;
+  }
+}

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Mon Jul 15 19:51:22 2013
@@ -19,7 +19,9 @@
 package org.apache.ctakes.assertion.eval;
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.lang.reflect.Constructor;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -31,6 +33,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.assertion.medfacts.cleartk.AlternateCuePhraseAnnotator;
 import org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine;
 import org.apache.ctakes.assertion.medfacts.cleartk.AssertionComponents;
@@ -75,6 +78,9 @@ import org.apache.uima.resource.metadata
 import org.apache.uima.util.CasCopier;
 import org.cleartk.classifier.DataWriter;
 import org.cleartk.classifier.jar.DefaultDataWriterFactory;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.feature.transform.InstanceStream;
 import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
 import org.cleartk.classifier.jar.GenericJarClassifierFactory;
 import org.cleartk.classifier.jar.JarClassifierBuilder;
@@ -229,6 +235,12 @@ private static Logger logger = Logger.ge
     		" as the annotator class itself, since ytex is under a different license than Apache cTAKES.",
     		required = false)
     public boolean useYtexNegation;
+
+    @Option(
+    		name = "--feature-selection",
+    		usage = "Takes an argument: {c,m} corresponding to Chi-square or Mutual Information-based feature selection",
+    		required = false)
+    public String featureSelectionAlgorithm = null;
   }
   
   protected ArrayList<String> annotationTypes;
@@ -569,6 +581,15 @@ public static void printScore(Map<String
 //    AnalysisEngineDescription cuePhraseLookupAnnotator =
 //        AnalysisEngineFactory.createAnalysisEngineDescription("org/apache/ctakes/dictionary/lookup/AssertionCuePhraseDictionaryLookupAnnotator");
 //    builder.add(cuePhraseLookupAnnotator);
+
+    // Set up Feature Selection parameters
+    Float featureSelectionThreshold = 0f;
+    Class<? extends DataWriter> dataWriterClassFirstPass = getDataWriterClass(); 
+    if (options.featureSelectionAlgorithm!=null) {
+    	featureSelectionThreshold = .1f;
+    }
+    
+    // Add each assertion Analysis Engine to the pipeline!
     builder.add(AnalysisEngineFactory.createPrimitiveDescription(AlternateCuePhraseAnnotator.class, new Object[]{}));
     
     if (!options.ignorePolarity)
@@ -585,9 +606,13 @@ public static void printScore(Map<String
 //    				CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 //    				this.dataWriterFactoryClass.getName(),
     				DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
-    				this.dataWriterClass,
+    				dataWriterClassFirstPass,
     				DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
-    				new File(directory, "polarity").getPath()
+    				new File(directory, "polarity").getPath(),
+    				AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+    				PolarityCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "polarity")),
+    				AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+    				featureSelectionThreshold
     				);
     		builder.add(polarityAnnotator);
     	}
@@ -603,9 +628,13 @@ public static void printScore(Map<String
 //	        CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 //	        this.dataWriterFactoryClass.getName(),
           DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
-          this.dataWriterClass,
+			dataWriterClassFirstPass,
 	        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
-	        new File(directory, "conditional").getPath()
+	        new File(directory, "conditional").getPath(),
+			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+			ConditionalCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "conditional")),
+			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+			featureSelectionThreshold
 	        );
 	    builder.add(conditionalAnnotator);
     }
@@ -620,9 +649,13 @@ public static void printScore(Map<String
 //	        CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 //	        this.dataWriterFactoryClass.getName(),
           DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
-          this.dataWriterClass,
+			dataWriterClassFirstPass,
 	        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
-	        new File(directory, "uncertainty").getPath()
+	        new File(directory, "uncertainty").getPath(),
+			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+			UncertaintyCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "uncertainty")),
+			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+			featureSelectionThreshold
 	        );
 	    builder.add(uncertaintyAnnotator);
     }
@@ -637,9 +670,13 @@ public static void printScore(Map<String
 //	        CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 //	        this.dataWriterFactoryClass.getName(),
           DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
-          this.dataWriterClass,
+			dataWriterClassFirstPass,
 	        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
-	        new File(directory, "subject").getPath()
+	        new File(directory, "subject").getPath(),
+			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+			SubjectCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "subject")),
+			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+			featureSelectionThreshold
 	        );
 	    builder.add(subjectAnnotator);
     }
@@ -654,9 +691,13 @@ public static void printScore(Map<String
 //		    CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 //		    this.dataWriterFactoryClass.getName(),
         DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
-        this.dataWriterClass,
+			dataWriterClassFirstPass,
 		    DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
-		    new File(directory, "generic").getPath()
+		    new File(directory, "generic").getPath(),
+			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+			GenericCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "generic")),
+			AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+			featureSelectionThreshold
 		    );
 		builder.add(genericAnnotator);
     }
@@ -671,9 +712,13 @@ public static void printScore(Map<String
 //    			CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 //    			this.dataWriterFactoryClass.getName(),
           DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
-          this.dataWriterClass,
+			dataWriterClassFirstPass,
     			DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
-    			new File(directory, "historyOf").getPath()
+    			new File(directory, "historyOf").getPath(),
+				AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+				HistoryCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "historyOf")),
+				AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+				featureSelectionThreshold
     			);
     	builder.add(historyAnnotator);
     }
@@ -699,7 +744,7 @@ public static void printScore(Map<String
     for (String currentAssertionAttribute : annotationTypes)
     {
     	File currentDirectory = new File(directory, currentAssertionAttribute);
-    	JarClassifierBuilder.trainAndPackage(currentDirectory, trainingArguments);
+    	trainAndPackage(currentAssertionAttribute, currentDirectory, trainingArguments);
     }
     //hider.restoreOutput();
   }
@@ -907,6 +952,78 @@ public static void printScore(Map<String
     return map;
   }
 
+  /**
+   * Trains and packages the model for a single assertion attribute.
+   *
+   * <p>When a feature selection algorithm is configured in {@code options}, the instances
+   * found in {@code directory} are reloaded, a selector for the attribute is trained on them
+   * and saved, and the instances are then rewritten with {@code this.dataWriterClass}
+   * (filtered through the selector) before the classifier is trained.
+   *
+   * @param currentAssertionAttribute attribute name: "polarity", "uncertainty", "conditional",
+   *        "subject", "generic" or "historyOf"; for any other name no selector exists and the
+   *        instances are rewritten unfiltered
+   * @param directory directory holding this attribute's training data
+   * @param arguments training arguments; NOTE(review): currently not forwarded, training
+   *        always runs with "-c 0.05" -- confirm whether that is intended
+   * @throws Exception if loading, selecting, writing or training fails
+   */
+  protected void trainAndPackage(String currentAssertionAttribute, File directory, String[] arguments) throws Exception {
+    if (options.featureSelectionAlgorithm != null) {
+      // Reload the instances that were written to this directory earlier
+      Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
+
+      // Pick the selector and its storage location for this attribute
+      // TODO: parameterize the thresholds
+      FeatureSelection<String> featureSelection = null;
+      java.net.URI featureSelectionLocation = null;
+      if ("polarity".equals(currentAssertionAttribute)) {
+        featureSelection = PolarityCleartkAnalysisEngine.createFeatureSelection(1f);
+        featureSelectionLocation = PolarityCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+      } else if ("uncertainty".equals(currentAssertionAttribute)) {
+        featureSelection = UncertaintyCleartkAnalysisEngine.createFeatureSelection(1f);
+        featureSelectionLocation = UncertaintyCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+      } else if ("conditional".equals(currentAssertionAttribute)) {
+        featureSelection = ConditionalCleartkAnalysisEngine.createFeatureSelection(1f);
+        featureSelectionLocation = ConditionalCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+      } else if ("subject".equals(currentAssertionAttribute)) {
+        featureSelection = SubjectCleartkAnalysisEngine.createFeatureSelection(1f);
+        featureSelectionLocation = SubjectCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+      } else if ("generic".equals(currentAssertionAttribute)) {
+        featureSelection = GenericCleartkAnalysisEngine.createFeatureSelection(1f);
+        featureSelectionLocation = GenericCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+      } else if ("historyOf".equals(currentAssertionAttribute)) {
+        featureSelection = HistoryCleartkAnalysisEngine.createFeatureSelection(1f);
+        featureSelectionLocation = HistoryCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+      }
+
+      // Unknown attributes leave featureSelection null; skip training/saving in that case
+      if (featureSelection != null) {
+        featureSelection.train(instances);
+        featureSelection.save(featureSelectionLocation);
+      }
+
+      // Now rewrite the instances with the final data writer
+//      LIBLINEARStringOutcomeDataWriter dataWriter = new LIBLINEARStringOutcomeDataWriter(directory);
+      Constructor<?> writerConstructor = this.dataWriterClass.getConstructor(File.class);
+      DataWriter dataWriter = (DataWriter) writerConstructor.newInstance(directory);
+      try {
+        for (Instance<String> instance : instances) {
+          if (featureSelection == null) {
+            // No selector for this attribute: pass the instance through unchanged
+            dataWriter.write(instance);
+          } else {
+            dataWriter.write(featureSelection.transform(instance));
+          }
+        }
+      } finally {
+        // Always let the writer finish, even if a write failed part-way through
+        dataWriter.finish();
+      }
+    }
+
+    // train models based on instances
+    JarClassifierBuilder.trainAndPackage(directory, "-c", "0.05");
+  }
+  
+  /**
+   * Chooses the data writer class based on whether feature selection is configured.
+   *
+   * @return {@code InstanceDataWriter} when {@code options.featureSelectionAlgorithm} is set,
+   *         otherwise {@code LIBLINEARStringOutcomeDataWriter}
+   * @throws ResourceInitializationException declared for callers; not thrown by this body
+   */
+  protected Class<? extends DataWriter> getDataWriterClass()
+      throws ResourceInitializationException {
+    if (options.featureSelectionAlgorithm == null) {
+      // No feature selection requested: write classifier-format data directly
+      return LIBLINEARStringOutcomeDataWriter.class;
+    }
+    return InstanceDataWriter.class;
+  }
+  
   private static boolean DEBUG = false;
   private static void printViewNames(String message, JCas jcas) {
 	

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java?rev=1503438&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java Mon Jul 15 19:51:22 2013
@@ -0,0 +1,31 @@
+package org.apache.ctakes.assertion.eval;
+
+import java.io.File;
+
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.feature.transform.InstanceStream;
+
+/**
+ * Small command-line driver that loads previously written training instances from a
+ * directory and trains the polarity feature selector on them.
+ */
+public class TestFeatureSelection {
+
+	/** Directory used when no command-line argument is given (original developer-local path). */
+	private static final String DEFAULT_DIRECTORY =
+			"/Users/m081914/work/sharpattr/ctakes/ctakes-assertion-res/resources/model/sharptrain-xval/fold_0/polarity";
+
+	/**
+	 * @param args optional; {@code args[0]}, when present, is the directory containing the
+	 *             instances file. Without arguments {@link #DEFAULT_DIRECTORY} is used.
+	 */
+	public static void main(String[] args) {
+		
+		String directoryPath = (args != null && args.length > 0) ? args[0] : DEFAULT_DIRECTORY;
+		File directory = new File(directoryPath);
+		
+		// Point the instance reader/writer at the file name used in that directory
+		InstanceDataWriter.INSTANCES_OUTPUT_FILENAME = "training-data.liblinear";
+		// Load the instances that were written to the directory
+		Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
+		
+		FeatureSelection<String> featureSelection = PolarityCleartkAnalysisEngine.createFeatureSelection(1f);
+		featureSelection.train(instances);
+		// Saving is intentionally left disabled:
+//		featureSelection.save(PolarityCleartkAnalysisEngine.createFeatureSelectionURI(directory));
+
+	}
+
+}

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,16 +18,17 @@
  */
 package org.apache.ctakes.assertion.medfacts.cleartk;
 
+import java.net.URI;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
 
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.assertion.zoner.types.Zone;
 import org.apache.ctakes.typesystem.type.structured.DocumentID;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.temporary.assertion.AssertionCuePhraseAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
@@ -39,7 +40,6 @@ import org.apache.uima.analysis_engine.A
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CASException;
 import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.classifier.CleartkAnnotator;
 import org.cleartk.classifier.Feature;
@@ -86,11 +86,30 @@ public abstract class AssertionCleartkAn
       mandatory = false,
       description = "probability that a default example should be retained for training")
   protected double probabilityOfKeepingADefaultExample = 1.0;
+
+  public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection"; // Accurate name? Actually uses the threshold, right?
+
+  @ConfigurationParameter(
+		  name = PARAM_FEATURE_SELECTION_THRESHOLD,
+		  mandatory = false,
+		  description = "the Chi-squared threshold at which features should be removed")
+  protected Float featureSelectionThreshold = 0f;
+
+  public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
+
+  @ConfigurationParameter(
+      mandatory = false,
+      name = PARAM_FEATURE_SELECTION_URI,
+      description = "provides a URI where the feature selection data will be written")
+  protected URI featureSelectionURI;
   
   protected Random coin = new Random(0);
 
+  protected static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
+
   protected String lastLabel;
   
+  
 /* DEPRECATED: STW 2013/03/28.  Use DependencyUtility:getNominalHeadNode(jCas,annotation) instead */
 //  public ConllDependencyNode findAnnotationHead(JCas jcas, Annotation annotation) {
 //		
@@ -117,9 +136,17 @@ public abstract class AssertionCleartkAn
   protected List<CleartkExtractor> tokenCleartkExtractors;
   protected List<SimpleFeatureExtractor> entityFeatureExtractors;
   protected CleartkExtractor cuePhraseInWindowExtractor;
+
+  protected FeatureSelection<String> featureSelection;
   
+  /**
+   * Attribute-specific labelling hook, invoked from {@code process} for each mention once its
+   * features have been gathered.
+   *
+   * @param entityMention the annotation being labelled
+   * @param instance the instance holding the mention's features (and, in training, its outcome)
+   * @throws AnalysisEngineProcessException if labelling fails
+   */
+  public abstract void setClassLabel(IdentifiedAnnotation entityMention, Instance<String> instance) throws AnalysisEngineProcessException;
+
+  /**
+   * Hook for subclasses to set up the {@code featureSelection} field.
+   *
+   * @throws ResourceInitializationException if feature selection cannot be set up
+   */
+  protected abstract void initializeFeatureSelection() throws ResourceInitializationException;
+//  public abstract FeatureSelection<String> createFeatureSelection(double threshold);
+//  public abstract URI createFeatureSelectionURI(File outputDirectoryName);
+
   @Override
-@SuppressWarnings("deprecation")
+  @SuppressWarnings("deprecation")
   public void initialize(UimaContext context) throws ResourceInitializationException {
     super.initialize(context);
     
@@ -204,9 +231,6 @@ public abstract class AssertionCleartkAn
     
   }
 
-  public abstract void setClassLabel(IdentifiedAnnotation entityMention, Instance<String> instance) throws AnalysisEngineProcessException;
-
-
   @Override
   public void process(JCas jCas) throws AnalysisEngineProcessException
   {
@@ -348,6 +372,7 @@ public abstract class AssertionCleartkAn
       }
       
       List<Feature> feats = instance.getFeatures();
+//      List<Feature> lcFeats = new ArrayList<Feature>();
       
       for(Feature feat : feats){
     	  if(feat.getName() != null && (feat.getName().startsWith("TreeFrag") || feat.getName().startsWith("WORD") || feat.getName().startsWith("NEG"))) continue;
@@ -355,9 +380,21 @@ public abstract class AssertionCleartkAn
     		  feat.setValue(((String)feat.getValue()).toLowerCase());
     	  }
       }
-      
+
+      // grab the output label
       setClassLabel(entityOrEventMention, instance);
-      
+
+      if (this.isTraining()) {
+    	  // apply feature selection, if necessary
+    	  if (this.featureSelection != null) {
+    		  feats = this.featureSelection.transform(feats);
+    	  }
+
+    	  // ensures that the (possibly) transformed feats are used
+    	  if (instance.getOutcome()!=null) {
+    		  this.dataWriter.write(new Instance<String>(instance.getOutcome(),feats));
+    	  }
+      }
     }
     
   }

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,11 +18,17 @@
  */
 package org.apache.ctakes.assertion.medfacts.cleartk;
 
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.classifier.Instance;
 
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 
 public class ConditionalCleartkAnalysisEngine extends
@@ -32,6 +38,8 @@ public class ConditionalCleartkAnalysisE
 	public void initialize(UimaContext context) throws ResourceInitializationException {
 		super.initialize(context);
 		probabilityOfKeepingADefaultExample = 0.1;
+		initializeFeatureSelection();
+
 	}
 	
 	@Override
@@ -47,7 +55,7 @@ public class ConditionalCleartkAnalysisE
 	        	return;
 	        }
 	        instance.setOutcome(conditional);
-	        this.dataWriter.write(instance);
+//	        this.dataWriter.write(instance);
 
 	      } else
 	      {
@@ -63,4 +71,30 @@ public class ConditionalCleartkAnalysisE
 	        entityOrEventMention.setConditional(conditional);
 	      }
 	}
+	/**
+	 * Builds the chi-squared feature selector used for the conditional attribute.
+	 *
+	 * @param threshold value handed to the {@code Chi2FeatureSelection} constructor
+	 * @return a new selector named {@code FEATURE_SELECTION_NAME}
+	 */
+	public static FeatureSelection<String> createFeatureSelection(double threshold) {
+		// Disabled alternative: new MutualInformationFeatureSelection<String>(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME)
+		final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+		FeatureSelection<String> chi2Selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+		return chi2Selector;
+	}
+
+	/**
+	 * Resolves the location of the feature-selection data file inside a directory.
+	 *
+	 * @param outputDirectoryName directory in which the data file is placed
+	 * @return URI of the file {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} in that directory
+	 */
+	public static URI createFeatureSelectionURI(File outputDirectoryName) {
+		String dataFileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+		File dataFile = new File(outputDirectoryName, dataFileName);
+		return dataFile.toURI();
+	}
+	  
+	/**
+	 * Sets up {@code featureSelection} from the configured threshold.
+	 * A null or zero threshold disables feature selection; otherwise a newly constructed
+	 * selector is installed (no previously saved state is loaded).
+	 *
+	 * @throws ResourceInitializationException declared by the base class contract
+	 */
+	@Override
+	protected void initializeFeatureSelection() throws ResourceInitializationException {
+		// The threshold is used as a boxed value here; guard against null before unboxing
+		if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+			this.featureSelection = null;
+			return;
+		}
+		// createFeatureSelection is static, so it is called without an instance qualifier
+		this.featureSelection = createFeatureSelection(featureSelectionThreshold);
+		// NOTE: loading saved selection data from featureSelectionURI is currently not enabled.
+	}
+	  
 }

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,9 +18,14 @@
  */
 package org.apache.ctakes.assertion.medfacts.cleartk;
 
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
 import java.util.ArrayList;
 
 import org.apache.ctakes.assertion.attributes.features.GenericFeaturesExtractor;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.uima.UimaContext;
@@ -49,6 +54,7 @@ public class GenericCleartkAnalysisEngin
 //		} else {
 			initialize_generic_extractor();
 //		}
+			initializeFeatureSelection();
 
 	}
 
@@ -80,12 +86,37 @@ public class GenericCleartkAnalysisEngin
 	        	return;
 	        }
 	        instance.setOutcome(generic);
-	        this.dataWriter.write(instance);
+//	        this.dataWriter.write(instance);
 	      } else
 	      {
 	        String label = this.classifier.classify(instance.getFeatures());
 	        entityOrEventMention.setGeneric("1".equals(label));
 	      }
 	}
+	/**
+	 * Builds the chi-squared feature selector used for the generic attribute.
+	 *
+	 * @param threshold value handed to the {@code Chi2FeatureSelection} constructor
+	 * @return a new selector named {@code FEATURE_SELECTION_NAME}
+	 */
+	public static FeatureSelection<String> createFeatureSelection(double threshold) {
+		// Disabled alternative: new MutualInformationFeatureSelection<String>(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME)
+		final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+		FeatureSelection<String> chi2Selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+		return chi2Selector;
+	}
 
+	/**
+	 * Resolves the location of the feature-selection data file inside a directory.
+	 *
+	 * @param outputDirectoryName directory in which the data file is placed
+	 * @return URI of the file {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} in that directory
+	 */
+	public static URI createFeatureSelectionURI(File outputDirectoryName) {
+		String dataFileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+		File dataFile = new File(outputDirectoryName, dataFileName);
+		return dataFile.toURI();
+	}
+	  
+	/**
+	 * Sets up {@code featureSelection} from the configured threshold.
+	 * A null or zero threshold disables feature selection; otherwise a newly constructed
+	 * selector is installed (no previously saved state is loaded).
+	 *
+	 * @throws ResourceInitializationException declared by the base class contract
+	 */
+	@Override
+	protected void initializeFeatureSelection() throws ResourceInitializationException {
+		// The threshold is used as a boxed value here; guard against null before unboxing
+		if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+			this.featureSelection = null;
+			return;
+		}
+		// createFeatureSelection is static, so it is called without an instance qualifier
+		this.featureSelection = createFeatureSelection(featureSelectionThreshold);
+		// NOTE: loading saved selection data from featureSelectionURI is currently not enabled.
+	}
+	  
 }

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,9 +18,14 @@
  */
 package org.apache.ctakes.assertion.medfacts.cleartk;
 
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
 import java.util.ArrayList;
 
 import org.apache.ctakes.assertion.attributes.features.HistoryFeaturesExtractor;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor;
 import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
@@ -50,6 +55,7 @@ public class HistoryCleartkAnalysisEngin
 //		} else {
 			initialize_history_extractor();
 //		}
+			initializeFeatureSelection();
 
 	}
 
@@ -83,11 +89,37 @@ public class HistoryCleartkAnalysisEngin
 	        }
 	                
 	        instance.setOutcome(String.valueOf(history));
-	        this.dataWriter.write(instance);
+//	        this.dataWriter.write(instance);
 	      } else
 	      {
 	        String label = this.classifier.classify(instance.getFeatures());
 	        entityOrEventMention.setHistoryOf(Integer.parseInt(label));
 	      }
 	}
+	/**
+	 * Builds the chi-squared feature selector used for the history attribute.
+	 *
+	 * @param threshold value handed to the {@code Chi2FeatureSelection} constructor
+	 * @return a new selector named {@code FEATURE_SELECTION_NAME}
+	 */
+	public static FeatureSelection<String> createFeatureSelection(double threshold) {
+		// Disabled alternative: new MutualInformationFeatureSelection<String>(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME)
+		final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+		FeatureSelection<String> chi2Selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+		return chi2Selector;
+	}
+
+	/**
+	 * Resolves the location of the feature-selection data file inside a directory.
+	 *
+	 * @param outputDirectoryName directory in which the data file is placed
+	 * @return URI of the file {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} in that directory
+	 */
+	public static URI createFeatureSelectionURI(File outputDirectoryName) {
+		String dataFileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+		File dataFile = new File(outputDirectoryName, dataFileName);
+		return dataFile.toURI();
+	}
+	  
+	/**
+	 * Sets up {@code featureSelection} from the configured threshold.
+	 * A null or zero threshold disables feature selection; otherwise a newly constructed
+	 * selector is installed (no previously saved state is loaded).
+	 *
+	 * @throws ResourceInitializationException declared by the base class contract
+	 */
+	@Override
+	protected void initializeFeatureSelection() throws ResourceInitializationException {
+		// The threshold is used as a boxed value here; guard against null before unboxing
+		if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+			this.featureSelection = null;
+			return;
+		}
+		// createFeatureSelection is static, so it is called without an instance qualifier
+		this.featureSelection = createFeatureSelection(featureSelectionThreshold);
+		// NOTE: loading saved selection data from featureSelectionURI is currently not enabled.
+	}
+	  
 }

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,8 +18,12 @@
  */
 package org.apache.ctakes.assertion.medfacts.cleartk;
 
+import java.io.File;
+import java.net.URI;
 import java.util.ArrayList;
 
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor;
 import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveRightFragmentExtractor;
 import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor;
@@ -50,6 +54,9 @@ public class PolarityCleartkAnalysisEngi
 		this.entityFeatureExtractors.add(new ContextWordWindowExtractor("org/apache/ctakes/assertion/models/polarity.txt"));
 		this.entityFeatureExtractors.add(new AboveLeftFragmentExtractor("AL_Polarity","org/apache/ctakes/assertion/models/sharpPolarityFrags.txt"));
 //		this.entityFeatureExtractors.add(new AboveRightFragmentExtractor("AR_Polarity","org/apache/ctakes/assertion/models/sharpArPolarityFrags.txt"));
+		
+		initializeFeatureSelection();
+
 	}
 
 	@Override
@@ -68,7 +75,7 @@ public class PolarityCleartkAnalysisEngi
 	        	return;
 	        }
 	        instance.setOutcome(polarity);
-	        this.dataWriter.write(instance);
+//	        this.dataWriter.write(instance);
 	      } else
 	      {
 	        String label = this.classifier.classify(instance.getFeatures());
@@ -85,4 +92,30 @@ public class PolarityCleartkAnalysisEngi
 	        entityOrEventMention.setPolarity(polarity);
 	      }
 	}
+	/**
+	 * Builds the chi-squared feature selector used for the polarity attribute.
+	 *
+	 * @param threshold value handed to the {@code Chi2FeatureSelection} constructor
+	 * @return a new selector named {@code FEATURE_SELECTION_NAME}
+	 */
+	public static FeatureSelection<String> createFeatureSelection(double threshold) {
+		// Disabled alternative: new MutualInformationFeatureSelection<String>(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME)
+		final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+		FeatureSelection<String> chi2Selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+		return chi2Selector;
+	}
+
+	/**
+	 * Resolves the location of the feature-selection data file inside a directory.
+	 *
+	 * @param outputDirectoryName directory in which the data file is placed
+	 * @return URI of the file {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} in that directory
+	 */
+	public static URI createFeatureSelectionURI(File outputDirectoryName) {
+		String dataFileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+		File dataFile = new File(outputDirectoryName, dataFileName);
+		return dataFile.toURI();
+	}
+
+	/**
+	 * Sets up {@code featureSelection} from the configured threshold.
+	 * A null or zero threshold disables feature selection; otherwise a newly constructed
+	 * selector is installed (no previously saved state is loaded).
+	 *
+	 * @throws ResourceInitializationException declared by the base class contract
+	 */
+	@Override
+	protected void initializeFeatureSelection() throws ResourceInitializationException {
+		// The threshold is used as a boxed value here; guard against null before unboxing
+		if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+			this.featureSelection = null;
+			return;
+		}
+		// createFeatureSelection is static, so it is called without an instance qualifier
+		this.featureSelection = createFeatureSelection(featureSelectionThreshold);
+		// NOTE: loading saved selection data from featureSelectionURI is currently not enabled.
+	}
+	  
 }

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,9 +18,14 @@
  */
 package org.apache.ctakes.assertion.medfacts.cleartk;
 
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
 import java.util.ArrayList;
 
 import org.apache.ctakes.assertion.attributes.features.SubjectFeaturesExtractor;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.log4j.Level;
 import org.apache.uima.UimaContext;
@@ -48,6 +53,7 @@ public class SubjectCleartkAnalysisEngin
 //		} else {
 			initialize_subject_extractor();
 //		}
+			initializeFeatureSelection();
 
 	}
 
@@ -77,7 +83,7 @@ public class SubjectCleartkAnalysisEngin
 	        	return;
 	        }
 	        instance.setOutcome(subj);
-	        this.dataWriter.write(instance);
+//	        this.dataWriter.write(instance);
 	        logger.log(Level.DEBUG,  String.format("[%s] expected: ''; actual: ''; features: %s",
 		      		  this.getClass().getSimpleName(),
 		      		  instance.toString()
@@ -90,5 +96,30 @@ public class SubjectCleartkAnalysisEngin
 	        logger.log(Level.DEBUG, "SUBJECT is being set on an IdentifiedAnnotation: "+label+" "+entityOrEventMention.getSubject());
 	      }
 	}
+	/**
+	 * Builds the chi-squared feature selector used for the subject attribute.
+	 *
+	 * @param threshold value handed to the {@code Chi2FeatureSelection} constructor
+	 * @return a new selector named {@code FEATURE_SELECTION_NAME}
+	 */
+	public static FeatureSelection<String> createFeatureSelection(double threshold) {
+		// Disabled alternative: new MutualInformationFeatureSelection<String>(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME)
+		final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+		FeatureSelection<String> chi2Selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+		return chi2Selector;
+	}
 
+	/**
+	 * Resolves the location of the feature-selection data file inside a directory.
+	 *
+	 * @param outputDirectoryName directory in which the data file is placed
+	 * @return URI of the file {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} in that directory
+	 */
+	public static URI createFeatureSelectionURI(File outputDirectoryName) {
+		String dataFileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+		File dataFile = new File(outputDirectoryName, dataFileName);
+		return dataFile.toURI();
+	}
+	
+	/**
+	 * Sets up {@code featureSelection} from the configured threshold.
+	 * A null or zero threshold disables feature selection; otherwise a newly constructed
+	 * selector is installed (no previously saved state is loaded).
+	 *
+	 * @throws ResourceInitializationException declared by the base class contract
+	 */
+	@Override
+	protected void initializeFeatureSelection() throws ResourceInitializationException {
+		// The threshold is used as a boxed value here; guard against null before unboxing
+		if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+			this.featureSelection = null;
+			return;
+		}
+		// createFeatureSelection is static, so it is called without an instance qualifier
+		this.featureSelection = createFeatureSelection(featureSelectionThreshold);
+		// NOTE: loading saved selection data from featureSelectionURI is currently not enabled.
+	}
+	  
 }

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,8 +18,13 @@
  */
 package org.apache.ctakes.assertion.medfacts.cleartk;
 
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
 import java.util.ArrayList;
 
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor;
 import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
@@ -40,8 +45,11 @@ public class UncertaintyCleartkAnalysisE
 		}
 		this.entityFeatureExtractors.add(new ContextWordWindowExtractor("org/apache/ctakes/assertion/models/uncertainty.txt"));
 		this.entityFeatureExtractors.add(new AboveLeftFragmentExtractor("ALUncertainty", "org/apache/ctakes/assertion/models/sharpUncertaintyFrags.txt"));
+
+		initializeFeatureSelection();
+		
 	}
-	
+
 	@Override
 	public void setClassLabel(IdentifiedAnnotation entityOrEventMention, Instance<String> instance) throws AnalysisEngineProcessException {
 		if (this.isTraining())
@@ -54,7 +62,7 @@ public class UncertaintyCleartkAnalysisE
 	        	return;
 	        }
 	        instance.setOutcome(uncertainty);
-	        this.dataWriter.write(instance);
+//	        this.dataWriter.write(instance);
 	      } else
 	      {
 	        String label = this.classifier.classify(instance.getFeatures());
@@ -69,5 +77,32 @@ public class UncertaintyCleartkAnalysisE
 	        entityOrEventMention.setUncertainty(uncertainty);
 	      }
 	}
+	
+	public static FeatureSelection<String> createFeatureSelection(double threshold) { // factory for this engine's feature selector
+		return new Chi2FeatureSelection<String>(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold);
+		// Disabled alternative: new MutualInformationFeatureSelection<String>(AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME)
+	}
+
+	public static URI createFeatureSelectionURI(File outputDirectoryName) { // file: URI of "<FEATURE_SELECTION_NAME>_Chi2_extractor.dat" inside the given directory
+		return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+	}
+	  
+	/**
+	 * Sets up {@code featureSelection} from the configured {@code featureSelectionThreshold}.
+	 * A threshold of exactly 0 disables selection (the field is set to {@code null}); any other
+	 * value creates a fresh selector through {@link #createFeatureSelection(double)}.
+	 * NOTE(review): loading previously saved selector data from {@code featureSelectionURI} was
+	 * left disabled in this revision, so nothing is loaded here -- confirm where that happens.
+	 */
+	@Override
+	protected void initializeFeatureSelection() throws ResourceInitializationException {
+		if (featureSelectionThreshold == 0) {
+			this.featureSelection = null;
+		} else {
+			// createFeatureSelection is static, so call it directly instead of through "this".
+			this.featureSelection = createFeatureSelection(this.featureSelectionThreshold);
+		}
+	}
+	  
 
 }

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java Mon Jul 15 19:51:22 2013
@@ -19,6 +19,7 @@ public class CrossValidateAttributeModel
 			params.add("--train-dir"); 			params.add(AssertionConst.trainingDirectories.get(attribute));
 			params.add("--models-dir"); 		params.add(AssertionConst.modelDirectory);
 			params.add("--cross-validation"); 	params.add("5");
+			params.add("--feature-selection");	params.add("c");
 			
 			// Build up an "ignore" string
 			for (String ignoreAttribute : AssertionConst.annotationTypes) {

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java Mon Jul 15 19:51:22 2013
@@ -25,9 +25,10 @@ public class TrainAttributeModels {
 			params.add("--models-dir"); params.add(AssertionConst.modelDirectory);
 //			params.add("--evaluation-output-dir");	params.add(AssertionConst.evalOutputDir); 
 			params.add("--train-only"); 
+			params.add("--feature-selection");	params.add("c");
 			
 			// Build up an "ignore" string
-			for (String ignoreAttribute : AssertionConst.annotationTypes) {
+			for (String ignoreAttribute : AssertionConst.allAnnotationTypes) {
 				if (!ignoreAttribute.equals(attribute)) { 
 
 					if (ignoreAttribute.equals("historyOf")) {