You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by sw...@apache.org on 2013/07/15 21:51:22 UTC
svn commit: r1503438 - in
/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion:
attributes/features/ attributes/features/selection/ eval/ medfacts/cleartk/
train/
Author: swu
Date: Mon Jul 15 19:51:22 2013
New Revision: 1503438
URL: http://svn.apache.org/r1503438
Log:
chi2 feature selection and lots of associated stuff
Added:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java (with props)
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java (with props)
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java (with props)
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java (with props)
Modified:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/GenericFeaturesExtractor.java Mon Jul 15 19:51:22 2013
@@ -51,7 +51,8 @@ public class GenericFeaturesExtractor im
// Pull in general dependency-based features -- externalize to another extractor?
ConllDependencyNode node = DependencyUtility.getNominalHeadNode(jCas, arg);
if (node!= null) {
- features.add(new Feature("DEPENDENCY_HEAD", node));
+ features.add(new Feature("DEPENDENCY_HEAD", node.getCoveredText()));
+ features.add(new Feature("DEPENDENCY_HEAD_deprel", node.getDeprel()));
}
HashMap<String, Boolean> featsMap = GenericAttributeClassifier.extract(jCas, arg);
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/SubjectFeaturesExtractor.java Mon Jul 15 19:51:22 2013
@@ -51,7 +51,8 @@ public class SubjectFeaturesExtractor im
// Pull in general dependency-based features -- externalize to another extractor?
ConllDependencyNode node = DependencyUtility.getNominalHeadNode(jCas, arg);
if (node!= null) {
- features.add(new Feature("DEPENDENCY_HEAD", node));
+ features.add(new Feature("DEPENDENCY_HEAD", node.getCoveredText()));
+ features.add(new Feature("DEPENDENCY_HEAD_deprel", node.getDeprel()));
}
HashMap<String, Boolean> featsMap = SubjectAttributeClassifier.extract(jCas, arg);
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java?rev=1503438&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java Mon Jul 15 19:51:22 2013
@@ -0,0 +1,197 @@
+package org.apache.ctakes.assertion.attributes.features.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
+import com.google.common.collect.Table;
+
+/**
+ * Selects features via chi-squared statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ * <p>
+ * {@link #train} keeps every feature whose score is strictly greater than the configured
+ * threshold; {@link #save} and {@link #load} persist the selection as one
+ * {@code <feature-name>\t<feature-score>} line per feature.
+ *
+ * @author Chen Lin
+ */
+public class Chi2FeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {
+
+  /**
+   * Helper class for aggregating counts and computing chi-squared statistics.
+   * <p>
+   * Every call to {@link #update} also adds to the per-outcome totals, so those totals (and the
+   * grand total {@code n} used by {@link #score}) count feature occurrences, not instances.
+   */
+  private static class Chi2Scorer<OUTCOME_T> implements Function<String, Double> {
+
+    /** Whether Yates's continuity correction (subtracting 0.5 from each |O - E|) is applied. */
+    private static final boolean YATES_CORRECTION = true;
+
+    /** Total occurrences recorded per outcome. */
+    protected Multiset<OUTCOME_T> classCounts;
+
+    /** Occurrences recorded per (feature name, outcome) pair. */
+    protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+    public Chi2Scorer() {
+      this.classCounts = HashMultiset.<OUTCOME_T> create();
+      this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+    }
+
+    /**
+     * Records that {@code featureName} was seen {@code occurrences} times with {@code outcome}.
+     */
+    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+      Integer count = this.featValueClassCount.get(featureName, outcome);
+      if (count == null) {
+        count = 0;
+      }
+      this.featValueClassCount.put(featureName, outcome, count + occurrences);
+      this.classCounts.add(outcome, occurrences);
+    }
+
+    /** Function view of {@link #score(String)}, used for ordering features by score. */
+    @Override
+    public Double apply(String featureName) {
+      return this.score(featureName);
+    }
+
+    /**
+     * Computes the chi-squared statistic of the feature-present / feature-absent split against
+     * the outcomes.
+     *
+     * @return the statistic, or 0.0 when the feature was recorded for none or for all of the
+     *         counted occurrences (zero degrees of freedom)
+     */
+    public double score(String featureName) {
+      // Contingency table:
+      //      | class1  | class2  | class3  | sum
+      // posi |         |         |         | posiFeatCount
+      // nega |         |         |         | negaFeatCount
+      //      | outcnt1 | outcnt2 | outcnt3 | n
+
+      int numOfClass = this.classCounts.elementSet().size();
+      int[] posiOutcomeCounts = new int[numOfClass];
+      int[] outcomeCounts = new int[numOfClass];
+      int classId = 0;
+      int posiFeatCount = 0;
+      for (OUTCOME_T clas : this.classCounts.elementSet()) {
+        // Table.get returns null for a (feature, outcome) pair that was never recorded
+        Integer observed = this.featValueClassCount.get(featureName, clas);
+        posiOutcomeCounts[classId] = observed == null ? 0 : observed;
+        posiFeatCount += posiOutcomeCounts[classId];
+        outcomeCounts[classId] = this.classCounts.count(clas);
+        classId++;
+      }
+
+      int n = this.classCounts.size();
+      int negaFeatCount = n - posiFeatCount;
+
+      if (posiFeatCount == 0 || posiFeatCount == n) {
+        // all counted occurrences share the same value on this feature: degrees of freedom = 0
+        return 0.0;
+      }
+
+      double chi2val = 0.0;
+      for (int lbl = 0; lbl < numOfClass; lbl++) {
+        double classRatio = outcomeCounts[lbl] / (double) n;
+        // positive (feature present) row
+        chi2val += cellContribution(posiOutcomeCounts[lbl], classRatio * (posiFeatCount));
+        // negative (feature absent) row
+        chi2val += cellContribution(
+            outcomeCounts[lbl] - posiOutcomeCounts[lbl],
+            classRatio * (negaFeatCount));
+      }
+      return chi2val;
+    }
+
+    /**
+     * Returns one cell's contribution {@code (|O - E| - correction)^2 / E}; cells with a
+     * non-positive expectation or a non-positive corrected difference contribute nothing.
+     */
+    private static double cellContribution(double observed, double expected) {
+      if (expected <= 0) {
+        return 0.0;
+      }
+      double diff = Math.abs(observed - expected);
+      if (YATES_CORRECTION) {
+        diff -= 0.5;
+      }
+      return diff > 0 ? Math.pow(diff, 2) / expected : 0.0;
+    }
+  }
+
+  /** Features must score strictly above this value to be selected. */
+  private double chi2Threshold;
+
+  /** Statistics gathered by {@link #train}; stays null when the selection was only loaded. */
+  private Chi2Scorer<OUTCOME_T> chi2Function;
+
+  /** Creates a selector with a threshold of 0.0. */
+  public Chi2FeatureSelection(String name) {
+    this(name, 0.0);
+  }
+
+  /**
+   * @param name
+   *          name passed to the superclass constructor
+   * @param threshold
+   *          features are kept only when their chi-squared score is greater than this value
+   */
+  public Chi2FeatureSelection(String name, double threshold) {
+    super(name);
+    this.chi2Threshold = threshold;
+  }
+
+  /** Returns true when the feature's name (see {@code getFeatureName}) has been selected. */
+  @Override
+  public boolean apply(Feature feature) {
+    return this.selectedFeatureNames.contains(this.getFeatureName(feature));
+  }
+
+  /**
+   * Aggregates counts over the features wrapped in transformable features of the given
+   * instances, then selects every feature whose score exceeds the threshold.
+   */
+  @Override
+  public void train(Iterable<Instance<OUTCOME_T>> instances) {
+    // aggregate statistics for all features
+    this.chi2Function = new Chi2Scorer<OUTCOME_T>();
+    for (Instance<OUTCOME_T> instance : instances) {
+      OUTCOME_T outcome = instance.getOutcome();
+      for (Feature feature : instance.getFeatures()) {
+        if (this.isTransformable(feature)) {
+          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+            this.chi2Function.update(this.getFeatureName(untransformedFeature), outcome, 1);
+          }
+        }
+      }
+    }
+
+    // keep only large chi2 valued features
+    this.selectedFeatureNames = Sets.newHashSet();
+    for (String featureName : this.chi2Function.featValueClassCount.rowKeySet()) {
+      if (this.chi2Function.score(featureName) > this.chi2Threshold) {
+        this.selectedFeatureNames.add(featureName);
+      }
+    }
+
+    this.isTrained = true;
+  }
+
+  /**
+   * Writes the selected features, highest score first, as {@code <name>\t<score>} lines.
+   *
+   * @throws IllegalStateException
+   *           if called before training, or after {@link #load} without training (loading does
+   *           not restore the scores needed to write the file)
+   * @throws IOException
+   *           if the file cannot be written
+   */
+  @Override
+  public void save(URI uri) throws IOException {
+    if (!this.isTrained) {
+      throw new IllegalStateException("Cannot save before training");
+    }
+    if (this.chi2Function == null) {
+      // load() marks the selector as trained but has no statistics to score features with
+      throw new IllegalStateException(
+          "Cannot save: feature scores are only available after train(), not after load()");
+    }
+
+    Ordering<String> ordering = Ordering.natural().onResultOf(this.chi2Function).reverse();
+    BufferedWriter writer = new BufferedWriter(new FileWriter(new File(uri)));
+    try {
+      for (String feature : ordering.immutableSortedCopy(this.selectedFeatureNames)) {
+        writer.append(String.format("%s\t%f\n", feature, this.chi2Function.score(feature)));
+      }
+    } finally {
+      // always release the file handle, even when writing fails part-way
+      writer.close();
+    }
+  }
+
+  /**
+   * Reads a selection written by {@link #save}; only the feature-name column is used.
+   * Blank lines are skipped.
+   *
+   * @throws IOException
+   *           if the file cannot be read
+   */
+  @Override
+  public void load(URI uri) throws IOException {
+    this.selectedFeatureNames = Sets.newLinkedHashSet();
+    BufferedReader reader = new BufferedReader(new FileReader(new File(uri)));
+    try {
+      // The lines are <feature-name>\t<feature-score>
+      String line;
+      while ((line = reader.readLine()) != null) {
+        if (line.length() == 0) {
+          continue;
+        }
+        String[] featureValuePair = line.split("\t");
+        if (featureValuePair.length == 0) {
+          // a line made only of tabs splits into nothing; there is no name to read
+          continue;
+        }
+        this.selectedFeatureNames.add(featureValuePair[0]);
+      }
+    } finally {
+      reader.close();
+    }
+    this.isTrained = true;
+  }
+}
Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/Chi2FeatureSelection.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java?rev=1503438&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java Mon Jul 15 19:51:22 2013
@@ -0,0 +1,64 @@
+package org.apache.ctakes.assertion.attributes.features.selection;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.cleartk.classifier.feature.transform.TrainableExtractor_ImplBase;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.Lists;
+
+/**
+ * Base class for feature selectors: a trainable extractor that doubles as a
+ * {@link Predicate} accepting exactly the features whose name is in the selected set.
+ */
+public abstract class FeatureSelection<OUTCOME_T> extends
+    TrainableExtractor_ImplBase<OUTCOME_T> implements Predicate<Feature> {
+
+  /** Set by subclasses once a selection is available. */
+  protected boolean isTrained;
+
+  /** Names (as produced by {@link #getFeatureName}) of the features to keep. */
+  protected Set<String> selectedFeatureNames;
+
+  public FeatureSelection(String name) {
+    super(name);
+    this.isTrained = false;
+  }
+
+  /** Accepts a feature when its selection name is among the selected names. */
+  @Override
+  public boolean apply(Feature feature) {
+    String key = this.getFeatureName(feature);
+    return this.selectedFeatureNames.contains(key);
+  }
+
+  /**
+   * Builds a new instance with the same outcome in which every transformable feature is
+   * replaced by those of its wrapped features that pass {@link #apply}; all other features
+   * are copied over unchanged.
+   */
+  @Override
+  public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
+    List<Feature> kept = new ArrayList<Feature>();
+    for (Feature candidate : instance.getFeatures()) {
+      if (!this.isTransformable(candidate)) {
+        // Not produced by this selector: pass through w/o filtering
+        kept.add(candidate);
+        continue;
+      }
+      // Unwrap and keep only the selected features
+      for (Feature wrapped : ((TransformableFeature) candidate).getFeatures()) {
+        if (this.apply(wrapped)) {
+          kept.add(wrapped);
+        }
+      }
+    }
+    return new Instance<OUTCOME_T>(instance.getOutcome(), kept);
+  }
+
+  /**
+   * Once trained, returns the features that pass {@link #apply}; before training, returns a
+   * single {@link TransformableFeature} wrapping all of them.
+   */
+  public List<Feature> transform(List<Feature> features) {
+    if (!this.isTrained) {
+      List<Feature> deferred = Lists.newArrayList();
+      deferred.add(new TransformableFeature(this.name, features));
+      return deferred;
+    }
+    return Lists.newArrayList(Collections2.filter(features, this));
+  }
+
+  /**
+   * Returns the key a feature is selected by: the bare feature name for numeric values,
+   * otherwise {@code name:value}.
+   */
+  protected String getFeatureName(Feature feature) {
+    String featureName = feature.getName();
+    Object featureValue = feature.getValue();
+    if (featureValue instanceof Number) {
+      return featureName;
+    }
+    return featureName + ":" + featureValue;
+  }
+
+}
Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/FeatureSelection.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java?rev=1503438&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java Mon Jul 15 19:51:22 2013
@@ -0,0 +1,267 @@
+package org.apache.ctakes.assertion.attributes.features.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
+import com.google.common.collect.Table;
+
+/**
+ * <br>
+ * Copyright (c) 2007-2012, Regents of the University of Colorado <br>
+ * All rights reserved.
+ * <p>
+ *
+ * Selects features via mutual information statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ * <p>
+ * {@link #train} ranks every feature by its combined per-outcome mutual information and keeps
+ * at most {@code numFeatures} of the best ones.
+ *
+ * @author Lee Becker
+ *
+ */
+public class MutualInformationFeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {
+
+  /**
+   * Specifies how scores for each outcome should be combined/aggregated into a single score
+   */
+  public static enum CombineScoreMethod implements Function<Map<?, Double>, Double> {
+    /** Arithmetic mean of the per-outcome scores. */
+    AVERAGE {
+      @Override
+      public Double apply(Map<?, Double> input) {
+        Collection<Double> scores = input.values();
+        int size = scores.size();
+        double total = 0;
+        for (Double score : scores) {
+          total += score;
+        }
+        return total / size;
+      }
+    },
+    /** Largest of the per-outcome scores. */
+    MAX {
+      @Override
+      public Double apply(Map<?, Double> input) {
+        return Ordering.natural().max(input.values());
+      }
+    }
+  }
+
+  /**
+   * Helper class for aggregating and computing mutual information statistics.
+   * <p>
+   * Every call to {@link #update} also adds to the per-outcome totals, so the totals count
+   * feature occurrences rather than instances.
+   */
+  public static class MutualInformationStats<OUTCOME_T> {
+    /** Total occurrences recorded per outcome. */
+    protected Multiset<OUTCOME_T> classCounts;
+
+    /** Occurrences recorded per (feature name, outcome) pair. */
+    protected Table<String, OUTCOME_T, Integer> classConditionalCounts;
+
+    /** Amount added to each joint (feature, outcome) cell before it is scored. */
+    protected double smoothingCount;
+
+    public MutualInformationStats(double smoothingCount) {
+      this.classCounts = HashMultiset.<OUTCOME_T> create();
+      this.classConditionalCounts = HashBasedTable.<String, OUTCOME_T, Integer> create();
+      this.smoothingCount = smoothingCount;
+    }
+
+    /**
+     * Records that {@code featureName} was seen {@code occurrences} times with {@code outcome}.
+     */
+    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+      Integer count = this.classConditionalCounts.get(featureName, outcome);
+      if (count == null) {
+        count = 0;
+      }
+      this.classConditionalCounts.put(featureName, outcome, count + occurrences);
+      this.classCounts.add(outcome, occurrences);
+    }
+
+    /**
+     * Computes the mutual information between "feature present/absent" and "outcome is / is not
+     * {@code outcome}" from the recorded counts.
+     * <p>
+     * The smoothing count is added to each of the four joint cells as a real number, so
+     * fractional smoothing is honoured. Cells whose smoothed joint count or whose marginal
+     * product is not positive contribute nothing instead of producing NaN or infinity.
+     */
+    public double mutualInformation(String featureName, OUTCOME_T outcome) {
+      // notation: index 0 means false, 1 means true
+      int[] featureCounts = new int[2];
+      int[] outcomeCounts = new int[2];
+      int[][] featureOutcomeCounts = new int[2][2];
+
+      int n = this.classCounts.size();
+      featureCounts[1] = sum(this.classConditionalCounts.row(featureName).values());
+      featureCounts[0] = n - featureCounts[1];
+      outcomeCounts[1] = this.classCounts.count(outcome);
+      outcomeCounts[0] = n - outcomeCounts[1];
+
+      // Table.get returns null for a (feature, outcome) pair that was never recorded
+      Integer both = this.classConditionalCounts.get(featureName, outcome);
+      featureOutcomeCounts[1][1] = both == null ? 0 : both;
+      featureOutcomeCounts[1][0] = featureCounts[1] - featureOutcomeCounts[1][1];
+      featureOutcomeCounts[0][1] = outcomeCounts[1] - featureOutcomeCounts[1][1];
+      featureOutcomeCounts[0][0] = n - featureCounts[1] - outcomeCounts[1]
+          + featureOutcomeCounts[1][1];
+
+      double information = 0.0;
+      for (int nFeature = 0; nFeature <= 1; nFeature++) {
+        for (int nOutcome = 0; nOutcome <= 1; nOutcome++) {
+          // smooth in floating point; adding a double to an int cell would truncate it
+          double jointCount = featureOutcomeCounts[nFeature][nOutcome] + this.smoothingCount;
+          double marginalProduct = (double) featureCounts[nFeature] * outcomeCounts[nOutcome];
+          if (jointCount <= 0.0 || marginalProduct <= 0.0) {
+            // 0 * log(0) is taken as 0, and a cell with an empty marginal cannot occur
+            continue;
+          }
+          information += jointCount
+              / (double) n
+              * Math.log(((double) n * jointCount) / marginalProduct);
+        }
+      }
+
+      return information;
+    }
+
+    private static int sum(Collection<Integer> values) {
+      int total = 0;
+      for (int v : values) {
+        total += v;
+      }
+      return total;
+    }
+
+    /**
+     * Writes a tab-separated table of the mutual information of every feature with every
+     * outcome, followed by a dump of the raw count table.
+     *
+     * @throws IOException
+     *           if the file cannot be written
+     */
+    public void save(URI outputURI) throws IOException {
+      BufferedWriter writer = new BufferedWriter(new FileWriter(new File(outputURI)));
+      try {
+        // Write out header
+        writer.append("Mutual Information Data\n");
+        writer.append("Feature\t");
+        writer.append(Joiner.on("\t").join(this.classConditionalCounts.columnKeySet()));
+        writer.append("\n");
+
+        // Write out Mutual Information data
+        for (String featureName : this.classConditionalCounts.rowKeySet()) {
+          writer.append(featureName);
+          for (OUTCOME_T outcome : this.classConditionalCounts.columnKeySet()) {
+            writer.append("\t");
+            writer.append(String.format("%f", this.mutualInformation(featureName, outcome)));
+          }
+          writer.append("\n");
+        }
+        writer.append("\n");
+        writer.append(this.classConditionalCounts.toString());
+      } finally {
+        // always release the file handle, even when writing fails part-way
+        writer.close();
+      }
+    }
+
+    /**
+     * Returns a function mapping a feature name to its per-outcome mutual information scores
+     * combined with {@code combineScoreMethod}.
+     */
+    public Function<String, Double> getScoreFunction(final CombineScoreMethod combineScoreMethod) {
+      return new Function<String, Double>() {
+
+        @Override
+        public Double apply(String featureName) {
+          Set<OUTCOME_T> outcomes = classConditionalCounts.columnKeySet();
+          Map<OUTCOME_T, Double> featureOutcomeMI = Maps.newHashMap();
+          for (OUTCOME_T outcome : outcomes) {
+            featureOutcomeMI.put(outcome, mutualInformation(featureName, outcome));
+          }
+          return combineScoreMethod.apply(featureOutcomeMI);
+        }
+      };
+    }
+  }
+
+  /** Statistics gathered by {@link #train}. */
+  private MutualInformationStats<OUTCOME_T> mutualInfoStats;
+
+  /** Upper bound on the number of features kept by {@link #train} and {@link #load}. */
+  private int numFeatures;
+
+  private CombineScoreMethod combineScoreMethod;
+
+  private double smoothingCount;
+
+  /** Creates a selector using MAX combination, a smoothing count of 1.0 and 10 features. */
+  public MutualInformationFeatureSelection(String name) {
+    this(name, CombineScoreMethod.MAX, 1.0, 10);
+  }
+
+  /** Creates a selector using MAX combination and a smoothing count of 1.0. */
+  public MutualInformationFeatureSelection(String name, int numFeatures) {
+    this(name, CombineScoreMethod.MAX, 1.0, numFeatures);
+  }
+
+  /**
+   * @param name
+   *          name passed to the superclass constructor
+   * @param combineScoreMethod
+   *          how the per-outcome scores of a feature are merged into one ranking score
+   * @param smoothingCount
+   *          amount added to each joint count when computing mutual information
+   * @param numFeatures
+   *          maximum number of top-ranked features to keep
+   */
+  public MutualInformationFeatureSelection(
+      String name,
+      CombineScoreMethod combineScoreMethod,
+      double smoothingCount,
+      int numFeatures) {
+    super(name);
+    this.combineScoreMethod = combineScoreMethod;
+    this.smoothingCount = smoothingCount;
+    this.numFeatures = numFeatures;
+  }
+
+  /**
+   * Aggregates counts over the features wrapped in transformable features of the given
+   * instances and keeps the {@code numFeatures} best-scoring ones (all of them when fewer
+   * distinct features were seen).
+   */
+  @Override
+  public void train(Iterable<Instance<OUTCOME_T>> instances) {
+    // aggregate statistics for all features and classes
+    this.mutualInfoStats = new MutualInformationStats<OUTCOME_T>(this.smoothingCount);
+    for (Instance<OUTCOME_T> instance : instances) {
+      OUTCOME_T outcome = instance.getOutcome();
+      for (Feature feature : instance.getFeatures()) {
+        if (this.isTransformable(feature)) {
+          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+            mutualInfoStats.update(this.getFeatureName(untransformedFeature), outcome, 1);
+          }
+        }
+      }
+    }
+
+    // sort features by mutual information score
+    Set<String> featureNames = mutualInfoStats.classConditionalCounts.rowKeySet();
+    Function<String, Double> scoreFunction = this.mutualInfoStats.getScoreFunction(this.combineScoreMethod);
+    Ordering<String> ordering = Ordering.natural().onResultOf(scoreFunction).reverse();
+
+    // keep only the top N features; never ask subList for more elements than exist
+    int keep = Math.min(this.numFeatures, featureNames.size());
+    this.selectedFeatureNames = Sets.newLinkedHashSet(ordering.immutableSortedCopy(featureNames).subList(
+        0,
+        keep));
+    this.isTrained = true;
+  }
+
+  /**
+   * Writes a {@code CombineScoreType\t<method>} header followed by one selected feature name
+   * per line.
+   *
+   * @throws IOException
+   *           if called before a selection is available or if the file cannot be written
+   */
+  @Override
+  public void save(URI uri) throws IOException {
+    if (!this.isTrained) {
+      throw new IOException("MutualInformationFeatureExtractor: Cannot save before training.");
+    }
+    BufferedWriter writer = new BufferedWriter(new FileWriter(new File(uri)));
+    try {
+      writer.append("CombineScoreType\t");
+      writer.append(this.combineScoreMethod.toString());
+      writer.append('\n');
+
+      for (String featureName : this.selectedFeatureNames) {
+        writer.append(featureName);
+        writer.append('\n');
+      }
+    } finally {
+      writer.close();
+    }
+  }
+
+  /**
+   * Reads a selection written by {@link #save}, keeping at most {@code numFeatures} names.
+   * Names are read verbatim (no trimming) so that they match what {@link #save} wrote; empty
+   * lines are skipped.
+   *
+   * @throws IOException
+   *           if the file cannot be read or its first line is not a
+   *           {@code CombineScoreType\t<method>} header
+   */
+  @Override
+  public void load(URI uri) throws IOException {
+    Set<String> loaded = Sets.newLinkedHashSet();
+    BufferedReader reader = new BufferedReader(new FileReader(new File(uri)));
+    try {
+      // First line specifies the combine utility type
+      String header = reader.readLine();
+      String[] headerFields = header == null ? new String[0] : header.split("\t");
+      if (headerFields.length < 2) {
+        throw new IOException(
+            "MutualInformationFeatureExtractor: Missing or malformed CombineScoreType header in "
+                + uri);
+      }
+      this.combineScoreMethod = CombineScoreMethod.valueOf(headerFields[1]);
+
+      // The rest of the lines are feature names, best first
+      String line;
+      int n = 0;
+      while (n < this.numFeatures && (line = reader.readLine()) != null) {
+        if (line.length() == 0) {
+          continue;
+        }
+        loaded.add(line);
+        n++;
+      }
+    } finally {
+      reader.close();
+    }
+
+    this.selectedFeatureNames = loaded;
+    this.isTrained = true;
+  }
+}
Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/features/selection/MutualInformationFeatureSelection.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Mon Jul 15 19:51:22 2013
@@ -19,7 +19,9 @@
package org.apache.ctakes.assertion.eval;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.IOException;
+import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -31,6 +33,7 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.assertion.medfacts.cleartk.AlternateCuePhraseAnnotator;
import org.apache.ctakes.assertion.medfacts.cleartk.AssertionCleartkAnalysisEngine;
import org.apache.ctakes.assertion.medfacts.cleartk.AssertionComponents;
@@ -75,6 +78,9 @@ import org.apache.uima.resource.metadata
import org.apache.uima.util.CasCopier;
import org.cleartk.classifier.DataWriter;
import org.cleartk.classifier.jar.DefaultDataWriterFactory;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.feature.transform.InstanceStream;
import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
import org.cleartk.classifier.jar.GenericJarClassifierFactory;
import org.cleartk.classifier.jar.JarClassifierBuilder;
@@ -229,6 +235,12 @@ private static Logger logger = Logger.ge
" as the annotator class itself, since ytex is under a different license than Apache cTAKES.",
required = false)
public boolean useYtexNegation;
+
+ @Option(
+ name = "--feature-selection",
+ usage = "Takes an argument: {c,m} corresponding to Chi-square or Mutual Information-based feature selection",
+ required = false)
+ public String featureSelectionAlgorithm = null;
}
protected ArrayList<String> annotationTypes;
@@ -569,6 +581,15 @@ public static void printScore(Map<String
// AnalysisEngineDescription cuePhraseLookupAnnotator =
// AnalysisEngineFactory.createAnalysisEngineDescription("org/apache/ctakes/dictionary/lookup/AssertionCuePhraseDictionaryLookupAnnotator");
// builder.add(cuePhraseLookupAnnotator);
+
+ // Set up Feature Selection parameters
+ Float featureSelectionThreshold = 0f;
+ Class<? extends DataWriter> dataWriterClassFirstPass = getDataWriterClass();
+ if (options.featureSelectionAlgorithm!=null) {
+ featureSelectionThreshold = .1f;
+ }
+
+ // Add each assertion Analysis Engine to the pipeline!
builder.add(AnalysisEngineFactory.createPrimitiveDescription(AlternateCuePhraseAnnotator.class, new Object[]{}));
if (!options.ignorePolarity)
@@ -585,9 +606,13 @@ public static void printScore(Map<String
// CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
// this.dataWriterFactoryClass.getName(),
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
- this.dataWriterClass,
+ dataWriterClassFirstPass,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
- new File(directory, "polarity").getPath()
+ new File(directory, "polarity").getPath(),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+ PolarityCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "polarity")),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+ featureSelectionThreshold
);
builder.add(polarityAnnotator);
}
@@ -603,9 +628,13 @@ public static void printScore(Map<String
// CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
// this.dataWriterFactoryClass.getName(),
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
- this.dataWriterClass,
+ dataWriterClassFirstPass,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
- new File(directory, "conditional").getPath()
+ new File(directory, "conditional").getPath(),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+ ConditionalCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "conditional")),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+ featureSelectionThreshold
);
builder.add(conditionalAnnotator);
}
@@ -620,9 +649,13 @@ public static void printScore(Map<String
// CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
// this.dataWriterFactoryClass.getName(),
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
- this.dataWriterClass,
+ dataWriterClassFirstPass,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
- new File(directory, "uncertainty").getPath()
+ new File(directory, "uncertainty").getPath(),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+ UncertaintyCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "uncertainty")),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+ featureSelectionThreshold
);
builder.add(uncertaintyAnnotator);
}
@@ -637,9 +670,13 @@ public static void printScore(Map<String
// CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
// this.dataWriterFactoryClass.getName(),
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
- this.dataWriterClass,
+ dataWriterClassFirstPass,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
- new File(directory, "subject").getPath()
+ new File(directory, "subject").getPath(),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+ SubjectCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "subject")),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+ featureSelectionThreshold
);
builder.add(subjectAnnotator);
}
@@ -654,9 +691,13 @@ public static void printScore(Map<String
// CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
// this.dataWriterFactoryClass.getName(),
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
- this.dataWriterClass,
+ dataWriterClassFirstPass,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
- new File(directory, "generic").getPath()
+ new File(directory, "generic").getPath(),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+ GenericCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "generic")),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+ featureSelectionThreshold
);
builder.add(genericAnnotator);
}
@@ -671,9 +712,13 @@ public static void printScore(Map<String
// CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
// this.dataWriterFactoryClass.getName(),
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
- this.dataWriterClass,
+ dataWriterClassFirstPass,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
- new File(directory, "historyOf").getPath()
+ new File(directory, "historyOf").getPath(),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_URI,
+ HistoryCleartkAnalysisEngine.createFeatureSelectionURI(new File(directory, "historyOf")),
+ AssertionCleartkAnalysisEngine.PARAM_FEATURE_SELECTION_THRESHOLD,
+ featureSelectionThreshold
);
builder.add(historyAnnotator);
}
@@ -699,7 +744,7 @@ public static void printScore(Map<String
for (String currentAssertionAttribute : annotationTypes)
{
File currentDirectory = new File(directory, currentAssertionAttribute);
- JarClassifierBuilder.trainAndPackage(currentDirectory, trainingArguments);
+ trainAndPackage(currentAssertionAttribute, currentDirectory, trainingArguments);
}
//hider.restoreOutput();
}
@@ -907,6 +952,78 @@ public static void printScore(Map<String
return map;
}
+ /**
+  * Trains and packages the classifier for a single assertion attribute.
+  * <p>
+  * When a feature selection algorithm is configured, the instances found in
+  * {@code directory} are reloaded, a feature selector for the attribute is trained on them
+  * and saved, and the instances are rewritten through {@code this.dataWriterClass} after
+  * being passed through the selector. If no selector is known for the attribute, the
+  * instances are rewritten unfiltered rather than failing with a NullPointerException.
+  *
+  * @param currentAssertionAttribute attribute name, e.g. "polarity" or "historyOf"
+  * @param directory directory the instances are loaded from and the classifier is trained in
+  * @param arguments training arguments. NOTE(review): currently ignored; the trainer is
+  *          always invoked with "-c", "0.05" -- confirm whether that is intended.
+  * @throws Exception if loading, rewriting or training fails
+  */
+ protected void trainAndPackage(String currentAssertionAttribute, File directory, String[] arguments) throws Exception {
+   if (options.featureSelectionAlgorithm != null) {
+     // Reload the instances that were written to the directory beforehand
+     Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
+
+     // Pick the selector and its save location for this attribute.
+     // TODO: parameterize the thresholds (fixed at 1f for every attribute)
+     FeatureSelection<String> featureSelection = null;
+     java.net.URI featureSelectionUri = null;
+     if ("polarity".equals(currentAssertionAttribute)) {
+       featureSelection = PolarityCleartkAnalysisEngine.createFeatureSelection(1f);
+       featureSelectionUri = PolarityCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+     } else if ("uncertainty".equals(currentAssertionAttribute)) {
+       featureSelection = UncertaintyCleartkAnalysisEngine.createFeatureSelection(1f);
+       featureSelectionUri = UncertaintyCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+     } else if ("conditional".equals(currentAssertionAttribute)) {
+       featureSelection = ConditionalCleartkAnalysisEngine.createFeatureSelection(1f);
+       featureSelectionUri = ConditionalCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+     } else if ("subject".equals(currentAssertionAttribute)) {
+       featureSelection = SubjectCleartkAnalysisEngine.createFeatureSelection(1f);
+       featureSelectionUri = SubjectCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+     } else if ("generic".equals(currentAssertionAttribute)) {
+       featureSelection = GenericCleartkAnalysisEngine.createFeatureSelection(1f);
+       featureSelectionUri = GenericCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+     } else if ("historyOf".equals(currentAssertionAttribute)) {
+       featureSelection = HistoryCleartkAnalysisEngine.createFeatureSelection(1f);
+       featureSelectionUri = HistoryCleartkAnalysisEngine.createFeatureSelectionURI(directory);
+     }
+
+     if (featureSelection != null) {
+       featureSelection.train(instances);
+       featureSelection.save(featureSelectionUri);
+     }
+
+     // Rewrite the instances with the configured data writer, filtering them when possible
+     Constructor<?> constructor = this.dataWriterClass.getConstructor(File.class);
+     DataWriter dataWriter = (DataWriter) constructor.newInstance(directory);
+     for (Instance<String> instance : instances) {
+       if (featureSelection == null) {
+         // unknown attribute: no selector available, keep all features
+         dataWriter.write(instance);
+       } else {
+         dataWriter.write(featureSelection.transform(instance));
+       }
+     }
+     dataWriter.finish();
+   }
+
+   // train models based on instances
+   JarClassifierBuilder.trainAndPackage(directory, "-c", "0.05");
+ }
+
+ /**
+  * Selects the data writer class to use.
+  *
+  * @return {@code InstanceDataWriter} when a feature selection algorithm is configured in
+  *         the options, otherwise {@code LIBLINEARStringOutcomeDataWriter}
+  */
+ protected Class<? extends DataWriter> getDataWriterClass()
+     throws ResourceInitializationException {
+   if (options.featureSelectionAlgorithm == null) {
+     return LIBLINEARStringOutcomeDataWriter.class;
+   }
+   return InstanceDataWriter.class;
+ }
+
private static boolean DEBUG = false;
private static void printViewNames(String message, JCas jcas) {
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java?rev=1503438&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java Mon Jul 15 19:51:22 2013
@@ -0,0 +1,31 @@
+package org.apache.ctakes.assertion.eval;
+
+import java.io.File;
+
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.feature.transform.InstanceStream;
+
+/**
+ * Small command-line driver that loads previously written polarity instances from a
+ * directory and trains a polarity feature selector on them.
+ */
+public class TestFeatureSelection {
+
+  /** Instance directory used when none is given on the command line. */
+  private static final String DEFAULT_DIRECTORY =
+      "/Users/m081914/work/sharpattr/ctakes/ctakes-assertion-res/resources/model/sharptrain-xval/fold_0/polarity";
+
+  /**
+   * @param args optional; {@code args[0]} overrides the directory the instances are loaded from
+   */
+  public static void main(String[] args) {
+
+    String directoryName = (args != null && args.length > 0) ? args[0] : DEFAULT_DIRECTORY;
+    File directory = new File(directoryName);
+
+    // NOTE(review): presumably this is the file name InstanceStream reads from -- confirm
+    InstanceDataWriter.INSTANCES_OUTPUT_FILENAME = "training-data.liblinear";
+    Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
+
+    FeatureSelection<String> featureSelection = PolarityCleartkAnalysisEngine.createFeatureSelection(1f);
+    featureSelection.train(instances);
+    // Saving the trained selector is disabled:
+//    featureSelection.save(PolarityCleartkAnalysisEngine.createFeatureSelectionURI(directory));
+
+  }
+
+}
Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/TestFeatureSelection.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,16 +18,17 @@
*/
package org.apache.ctakes.assertion.medfacts.cleartk;
+import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Random;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.assertion.zoner.types.Zone;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
import org.apache.ctakes.typesystem.type.temporary.assertion.AssertionCuePhraseAnnotation;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
@@ -39,7 +40,6 @@ import org.apache.uima.analysis_engine.A
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.classifier.CleartkAnnotator;
import org.cleartk.classifier.Feature;
@@ -86,11 +86,30 @@ public abstract class AssertionCleartkAn
mandatory = false,
description = "probability that a default example should be retained for training")
protected double probabilityOfKeepingADefaultExample = 1.0;
+
+ public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection"; // NOTE: the name is misleading -- the value holds the Chi-squared threshold (a Float), not a boolean flag
+
+ @ConfigurationParameter(
+ name = PARAM_FEATURE_SELECTION_THRESHOLD,
+ mandatory = false,
+ description = "the Chi-squared threshold at which features should be removed")
+ protected Float featureSelectionThreshold = 0f;
+
+ public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
+
+ @ConfigurationParameter(
+ mandatory = false,
+ name = PARAM_FEATURE_SELECTION_URI,
+ description = "provides a URI where the feature selection data will be written")
+ protected URI featureSelectionURI;
protected Random coin = new Random(0);
+ protected static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
+
protected String lastLabel;
+
/* DEPRECATED: STW 2013/03/28. Use DependencyUtility:getNominalHeadNode(jCas,annotation) instead */
// public ConllDependencyNode findAnnotationHead(JCas jcas, Annotation annotation) {
//
@@ -117,9 +136,17 @@ public abstract class AssertionCleartkAn
protected List<CleartkExtractor> tokenCleartkExtractors;
protected List<SimpleFeatureExtractor> entityFeatureExtractors;
protected CleartkExtractor cuePhraseInWindowExtractor;
+
+ protected FeatureSelection<String> featureSelection;
+ public abstract void setClassLabel(IdentifiedAnnotation entityMention, Instance<String> instance) throws AnalysisEngineProcessException;
+
+ protected abstract void initializeFeatureSelection() throws ResourceInitializationException;
+// public abstract FeatureSelection<String> createFeatureSelection(double threshold);
+// public abstract URI createFeatureSelectionURI(File outputDirectoryName);
+
@Override
-@SuppressWarnings("deprecation")
+ @SuppressWarnings("deprecation")
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
@@ -204,9 +231,6 @@ public abstract class AssertionCleartkAn
}
- public abstract void setClassLabel(IdentifiedAnnotation entityMention, Instance<String> instance) throws AnalysisEngineProcessException;
-
-
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException
{
@@ -348,6 +372,7 @@ public abstract class AssertionCleartkAn
}
List<Feature> feats = instance.getFeatures();
+// List<Feature> lcFeats = new ArrayList<Feature>();
for(Feature feat : feats){
if(feat.getName() != null && (feat.getName().startsWith("TreeFrag") || feat.getName().startsWith("WORD") || feat.getName().startsWith("NEG"))) continue;
@@ -355,9 +380,21 @@ public abstract class AssertionCleartkAn
feat.setValue(((String)feat.getValue()).toLowerCase());
}
}
-
+
+ // grab the output label
setClassLabel(entityOrEventMention, instance);
-
+
+ if (this.isTraining()) {
+ // apply feature selection, if necessary
+ if (this.featureSelection != null) {
+ feats = this.featureSelection.transform(feats);
+ }
+
+ // ensures that the (possibly) transformed feats are used
+ if (instance.getOutcome()!=null) {
+ this.dataWriter.write(new Instance<String>(instance.getOutcome(),feats));
+ }
+ }
}
}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/ConditionalCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,11 +18,17 @@
*/
package org.apache.ctakes.assertion.medfacts.cleartk;
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.classifier.Instance;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
public class ConditionalCleartkAnalysisEngine extends
@@ -32,6 +38,8 @@ public class ConditionalCleartkAnalysisE
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
probabilityOfKeepingADefaultExample = 0.1;
+ initializeFeatureSelection();
+
}
@Override
@@ -47,7 +55,7 @@ public class ConditionalCleartkAnalysisE
return;
}
instance.setOutcome(conditional);
- this.dataWriter.write(instance);
+// this.dataWriter.write(instance);
} else
{
@@ -63,4 +71,30 @@ public class ConditionalCleartkAnalysisE
entityOrEventMention.setConditional(conditional);
}
}
+ /**
+  * Builds the feature selector for this attribute: a {@code Chi2FeatureSelection} named
+  * {@code FEATURE_SELECTION_NAME} and configured with the given threshold.
+  *
+  * @param threshold threshold handed to the Chi2 selector
+  * @return the new, untrained selector
+  */
+ public static FeatureSelection<String> createFeatureSelection(double threshold) {
+   final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+   // Disabled alternative: new MutualInformationFeatureSelection<String>(selectorName)
+   FeatureSelection<String> selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+   return selector;
+ }
+
+ public static URI createFeatureSelectionURI(File outputDirectoryName) {
+ return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+ }
+
+ /**
+  * Sets up {@code featureSelection} from the configured threshold. A missing (null) or
+  * zero threshold disables feature selection; otherwise a new selector is created with
+  * that threshold.
+  */
+ @Override
+ protected void initializeFeatureSelection() throws ResourceInitializationException {
+   // Guard against null before unboxing the Float threshold
+   if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+     this.featureSelection = null;
+     return;
+   }
+
+   this.featureSelection = createFeatureSelection(this.featureSelectionThreshold);
+
+   // TODO: loading saved selection data is disabled. To re-enable, call
+   // this.featureSelection.load(this.featureSelectionURI) when that file exists and wrap
+   // any IOException in a ResourceInitializationException.
+ }
+
}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/GenericCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,9 +18,14 @@
*/
package org.apache.ctakes.assertion.medfacts.cleartk;
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
import java.util.ArrayList;
import org.apache.ctakes.assertion.attributes.features.GenericFeaturesExtractor;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.uima.UimaContext;
@@ -49,6 +54,7 @@ public class GenericCleartkAnalysisEngin
// } else {
initialize_generic_extractor();
// }
+ initializeFeatureSelection();
}
@@ -80,12 +86,37 @@ public class GenericCleartkAnalysisEngin
return;
}
instance.setOutcome(generic);
- this.dataWriter.write(instance);
+// this.dataWriter.write(instance);
} else
{
String label = this.classifier.classify(instance.getFeatures());
entityOrEventMention.setGeneric("1".equals(label));
}
}
+ /**
+  * Builds the feature selector for this attribute: a {@code Chi2FeatureSelection} named
+  * {@code FEATURE_SELECTION_NAME} and configured with the given threshold.
+  *
+  * @param threshold threshold handed to the Chi2 selector
+  * @return the new, untrained selector
+  */
+ public static FeatureSelection<String> createFeatureSelection(double threshold) {
+   final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+   // Disabled alternative: new MutualInformationFeatureSelection<String>(selectorName)
+   FeatureSelection<String> selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+   return selector;
+ }
+ public static URI createFeatureSelectionURI(File outputDirectoryName) {
+ return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+ }
+
+ /**
+  * Sets up {@code featureSelection} from the configured threshold. A missing (null) or
+  * zero threshold disables feature selection; otherwise a new selector is created with
+  * that threshold.
+  */
+ @Override
+ protected void initializeFeatureSelection() throws ResourceInitializationException {
+   // Guard against null before unboxing the Float threshold
+   if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+     this.featureSelection = null;
+     return;
+   }
+
+   this.featureSelection = createFeatureSelection(this.featureSelectionThreshold);
+
+   // TODO: loading saved selection data is disabled. To re-enable, call
+   // this.featureSelection.load(this.featureSelectionURI) when that file exists and wrap
+   // any IOException in a ResourceInitializationException.
+ }
+
}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/HistoryCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,9 +18,14 @@
*/
package org.apache.ctakes.assertion.medfacts.cleartk;
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
import java.util.ArrayList;
import org.apache.ctakes.assertion.attributes.features.HistoryFeaturesExtractor;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor;
import org.apache.ctakes.typesystem.type.constants.CONST;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
@@ -50,6 +55,7 @@ public class HistoryCleartkAnalysisEngin
// } else {
initialize_history_extractor();
// }
+ initializeFeatureSelection();
}
@@ -83,11 +89,37 @@ public class HistoryCleartkAnalysisEngin
}
instance.setOutcome(String.valueOf(history));
- this.dataWriter.write(instance);
+// this.dataWriter.write(instance);
} else
{
String label = this.classifier.classify(instance.getFeatures());
entityOrEventMention.setHistoryOf(Integer.parseInt(label));
}
}
+ /**
+  * Builds the feature selector for this attribute: a {@code Chi2FeatureSelection} named
+  * {@code FEATURE_SELECTION_NAME} and configured with the given threshold.
+  *
+  * @param threshold threshold handed to the Chi2 selector
+  * @return the new, untrained selector
+  */
+ public static FeatureSelection<String> createFeatureSelection(double threshold) {
+   final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+   // Disabled alternative: new MutualInformationFeatureSelection<String>(selectorName)
+   FeatureSelection<String> selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+   return selector;
+ }
+
+ public static URI createFeatureSelectionURI(File outputDirectoryName) {
+ return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+ }
+
+ /**
+  * Sets up {@code featureSelection} from the configured threshold. A missing (null) or
+  * zero threshold disables feature selection; otherwise a new selector is created with
+  * that threshold.
+  */
+ @Override
+ protected void initializeFeatureSelection() throws ResourceInitializationException {
+   // Guard against null before unboxing the Float threshold
+   if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+     this.featureSelection = null;
+     return;
+   }
+
+   this.featureSelection = createFeatureSelection(this.featureSelectionThreshold);
+
+   // TODO: loading saved selection data is disabled. To re-enable, call
+   // this.featureSelection.load(this.featureSelectionURI) when that file exists and wrap
+   // any IOException in a ResourceInitializationException.
+ }
+
}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/PolarityCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,8 +18,12 @@
*/
package org.apache.ctakes.assertion.medfacts.cleartk;
+import java.io.File;
+import java.net.URI;
import java.util.ArrayList;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor;
import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveRightFragmentExtractor;
import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor;
@@ -50,6 +54,9 @@ public class PolarityCleartkAnalysisEngi
this.entityFeatureExtractors.add(new ContextWordWindowExtractor("org/apache/ctakes/assertion/models/polarity.txt"));
this.entityFeatureExtractors.add(new AboveLeftFragmentExtractor("AL_Polarity","org/apache/ctakes/assertion/models/sharpPolarityFrags.txt"));
// this.entityFeatureExtractors.add(new AboveRightFragmentExtractor("AR_Polarity","org/apache/ctakes/assertion/models/sharpArPolarityFrags.txt"));
+
+ initializeFeatureSelection();
+
}
@Override
@@ -68,7 +75,7 @@ public class PolarityCleartkAnalysisEngi
return;
}
instance.setOutcome(polarity);
- this.dataWriter.write(instance);
+// this.dataWriter.write(instance);
} else
{
String label = this.classifier.classify(instance.getFeatures());
@@ -85,4 +92,30 @@ public class PolarityCleartkAnalysisEngi
entityOrEventMention.setPolarity(polarity);
}
}
+ /**
+  * Builds the feature selector for this attribute: a {@code Chi2FeatureSelection} named
+  * {@code FEATURE_SELECTION_NAME} and configured with the given threshold.
+  *
+  * @param threshold threshold handed to the Chi2 selector
+  * @return the new, untrained selector
+  */
+ public static FeatureSelection<String> createFeatureSelection(double threshold) {
+   final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+   // Disabled alternative: new MutualInformationFeatureSelection<String>(selectorName)
+   FeatureSelection<String> selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+   return selector;
+ }
+
+ public static URI createFeatureSelectionURI(File outputDirectoryName) {
+ return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+ }
+
+ /**
+  * Sets up {@code featureSelection} from the configured threshold. A missing (null) or
+  * zero threshold disables feature selection; otherwise a new selector is created with
+  * that threshold.
+  */
+ @Override
+ protected void initializeFeatureSelection() throws ResourceInitializationException {
+   // Guard against null before unboxing the Float threshold
+   if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+     this.featureSelection = null;
+     return;
+   }
+
+   this.featureSelection = createFeatureSelection(this.featureSelectionThreshold);
+
+   // TODO: loading saved selection data is disabled. To re-enable, call
+   // this.featureSelection.load(this.featureSelectionURI) when that file exists and wrap
+   // any IOException in a ResourceInitializationException.
+ }
+
}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/SubjectCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,9 +18,14 @@
*/
package org.apache.ctakes.assertion.medfacts.cleartk;
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
import java.util.ArrayList;
import org.apache.ctakes.assertion.attributes.features.SubjectFeaturesExtractor;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.log4j.Level;
import org.apache.uima.UimaContext;
@@ -48,6 +53,7 @@ public class SubjectCleartkAnalysisEngin
// } else {
initialize_subject_extractor();
// }
+ initializeFeatureSelection();
}
@@ -77,7 +83,7 @@ public class SubjectCleartkAnalysisEngin
return;
}
instance.setOutcome(subj);
- this.dataWriter.write(instance);
+// this.dataWriter.write(instance);
logger.log(Level.DEBUG, String.format("[%s] expected: ''; actual: ''; features: %s",
this.getClass().getSimpleName(),
instance.toString()
@@ -90,5 +96,30 @@ public class SubjectCleartkAnalysisEngin
logger.log(Level.DEBUG, "SUBJECT is being set on an IdentifiedAnnotation: "+label+" "+entityOrEventMention.getSubject());
}
}
+ /**
+  * Builds the feature selector for this attribute: a {@code Chi2FeatureSelection} named
+  * {@code FEATURE_SELECTION_NAME} and configured with the given threshold.
+  *
+  * @param threshold threshold handed to the Chi2 selector
+  * @return the new, untrained selector
+  */
+ public static FeatureSelection<String> createFeatureSelection(double threshold) {
+   final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+   // Disabled alternative: new MutualInformationFeatureSelection<String>(selectorName)
+   FeatureSelection<String> selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+   return selector;
+ }
+ public static URI createFeatureSelectionURI(File outputDirectoryName) {
+ return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+ }
+
+ /**
+  * Sets up {@code featureSelection} from the configured threshold. A missing (null) or
+  * zero threshold disables feature selection; otherwise a new selector is created with
+  * that threshold.
+  */
+ @Override
+ protected void initializeFeatureSelection() throws ResourceInitializationException {
+   // Guard against null before unboxing the Float threshold
+   if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+     this.featureSelection = null;
+     return;
+   }
+
+   this.featureSelection = createFeatureSelection(this.featureSelectionThreshold);
+
+   // TODO: loading saved selection data is disabled. To re-enable, call
+   // this.featureSelection.load(this.featureSelectionURI) when that file exists and wrap
+   // any IOException in a ResourceInitializationException.
+ }
+
}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/UncertaintyCleartkAnalysisEngine.java Mon Jul 15 19:51:22 2013
@@ -18,8 +18,13 @@
*/
package org.apache.ctakes.assertion.medfacts.cleartk;
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
import java.util.ArrayList;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor;
import org.apache.ctakes.assertion.medfacts.cleartk.extractors.ContextWordWindowExtractor;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
@@ -40,8 +45,11 @@ public class UncertaintyCleartkAnalysisE
}
this.entityFeatureExtractors.add(new ContextWordWindowExtractor("org/apache/ctakes/assertion/models/uncertainty.txt"));
this.entityFeatureExtractors.add(new AboveLeftFragmentExtractor("ALUncertainty", "org/apache/ctakes/assertion/models/sharpUncertaintyFrags.txt"));
+
+ initializeFeatureSelection();
+
}
-
+
@Override
public void setClassLabel(IdentifiedAnnotation entityOrEventMention, Instance<String> instance) throws AnalysisEngineProcessException {
if (this.isTraining())
@@ -54,7 +62,7 @@ public class UncertaintyCleartkAnalysisE
return;
}
instance.setOutcome(uncertainty);
- this.dataWriter.write(instance);
+// this.dataWriter.write(instance);
} else
{
String label = this.classifier.classify(instance.getFeatures());
@@ -69,5 +77,32 @@ public class UncertaintyCleartkAnalysisE
entityOrEventMention.setUncertainty(uncertainty);
}
}
+
+ /**
+  * Builds the feature selector for this attribute: a {@code Chi2FeatureSelection} named
+  * {@code FEATURE_SELECTION_NAME} and configured with the given threshold.
+  *
+  * @param threshold threshold handed to the Chi2 selector
+  * @return the new, untrained selector
+  */
+ public static FeatureSelection<String> createFeatureSelection(double threshold) {
+   final String selectorName = AssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+   // Disabled alternative: new MutualInformationFeatureSelection<String>(selectorName)
+   FeatureSelection<String> selector = new Chi2FeatureSelection<String>(selectorName, threshold);
+   return selector;
+ }
+
+ public static URI createFeatureSelectionURI(File outputDirectoryName) {
+ return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+ }
+
+ /**
+  * Sets up {@code featureSelection} from the configured threshold. A missing (null) or
+  * zero threshold disables feature selection; otherwise a new selector is created with
+  * that threshold.
+  */
+ @Override
+ protected void initializeFeatureSelection() throws ResourceInitializationException {
+   // Guard against null before unboxing the Float threshold
+   if (featureSelectionThreshold == null || featureSelectionThreshold == 0) {
+     this.featureSelection = null;
+     return;
+   }
+
+   this.featureSelection = createFeatureSelection(this.featureSelectionThreshold);
+
+   // TODO: loading saved selection data is disabled. To re-enable, call
+   // this.featureSelection.load(this.featureSelectionURI) when that file exists and wrap
+   // any IOException in a ResourceInitializationException.
+ }
+
}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/CrossValidateAttributeModels.java Mon Jul 15 19:51:22 2013
@@ -19,6 +19,7 @@ public class CrossValidateAttributeModel
params.add("--train-dir"); params.add(AssertionConst.trainingDirectories.get(attribute));
params.add("--models-dir"); params.add(AssertionConst.modelDirectory);
params.add("--cross-validation"); params.add("5");
+ params.add("--feature-selection"); params.add("c");
// Build up an "ignore" string
for (String ignoreAttribute : AssertionConst.annotationTypes) {
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java?rev=1503438&r1=1503437&r2=1503438&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/train/TrainAttributeModels.java Mon Jul 15 19:51:22 2013
@@ -25,9 +25,10 @@ public class TrainAttributeModels {
params.add("--models-dir"); params.add(AssertionConst.modelDirectory);
// params.add("--evaluation-output-dir"); params.add(AssertionConst.evalOutputDir);
params.add("--train-only");
+ params.add("--feature-selection"); params.add("c");
// Build up an "ignore" string
- for (String ignoreAttribute : AssertionConst.annotationTypes) {
+ for (String ignoreAttribute : AssertionConst.allAnnotationTypes) {
if (!ignoreAttribute.equals(attribute)) {
if (ignoreAttribute.equals("historyOf")) {