You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2013/07/25 16:32:03 UTC
svn commit: r1506989 - in
/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal:
ae/EventAnnotator.java ae/feature/selection/Chi2FeatureSelection.java
eval/EvaluationOfEventSpans.java
Author: clin
Date: Thu Jul 25 14:32:02 2013
New Revision: 1506989
URL: http://svn.apache.org/r1506989
Log:
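Add a token-count check to the event annotator so that entities covering no tokens are skipped.
Change the feature selection threshold to a fraction in [0, 1], i.e. select the top X% of all features ranked by Chi-squared score.
Fix the SMOTE and feature selection default values (feature selection threshold defaults to 1, i.e. keep all features; SMOTE neighbor number defaults to 0).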
Modified:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1506989&r1=1506988&r2=1506989&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Thu Jul 25 14:32:02 2013
@@ -85,7 +85,7 @@ public class EventAnnotator extends Temp
name = PARAM_FEATURE_SELECTION_THRESHOLD,
mandatory = false,
description = "the Chi-squared threshold at which features should be removed")
- protected Float featureSelectionThreshold = 0f;
+ protected Float featureSelectionThreshold = 1f; //default is not using feature selection, i.e. select 100% of all features.
public static final String PARAM_SMOTE_NUM_NEIGHBORS = "NumOfNeighborForSMOTE";
@@ -151,7 +151,7 @@ public class EventAnnotator extends Temp
private static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
public static FeatureSelection<String> createFeatureSelection(double threshold) {
- return new Chi2FeatureSelection<String>(EventAnnotator.FEATURE_SELECTION_NAME, threshold);
+ return new Chi2FeatureSelection<String>(EventAnnotator.FEATURE_SELECTION_NAME, threshold, false);
}
public static URI createFeatureSelectionURI(File outputDirectoryName) {
@@ -185,7 +185,7 @@ public class EventAnnotator extends Temp
new Preceding(3),
new Following(3));
- if (featureSelectionThreshold == 0) {
+ if (featureSelectionThreshold == 1) {
this.featureSelection = null;
} else {
this.featureSelection = EventAnnotator.createFeatureSelection(this.featureSelectionThreshold);
@@ -209,9 +209,11 @@ public class EventAnnotator extends Temp
for (IdentifiedAnnotation entity : JCasUtil.select(jCas, IdentifiedAnnotation.class)) {
if (!entity.getClass().equals(EventMention.class)) {
List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, entity);
- BaseToken lastToken = tokens.get(tokens.size() - 1);
- String value = String.format("%s_%s", entity.getClass().getSimpleName(), entity.getTypeID());
- endOfEntityFeatures.put(lastToken, new Feature("EndOf", value));
+ if (tokens.size() > 0){
+ BaseToken lastToken = tokens.get(tokens.size() - 1);
+ String value = String.format("%s_%s", entity.getClass().getSimpleName(), entity.getTypeID());
+ endOfEntityFeatures.put(lastToken, new Feature("EndOf", value));
+ }
}
}
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java?rev=1506989&r1=1506988&r2=1506989&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java Thu Jul 25 14:32:02 2013
@@ -7,6 +7,7 @@ import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URI;
+import java.util.Set;
import org.cleartk.classifier.Feature;
import org.cleartk.classifier.Instance;
@@ -30,183 +31,210 @@ import com.google.common.collect.Table;
*/
public class Chi2FeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {
- /**
- * Helper class for aggregating and computing mutual Chi2 statistics
- */
- private static class Chi2Scorer<OUTCOME_T> implements Function<String, Double> {
- protected Multiset<OUTCOME_T> classCounts;
+ /**
+ * Helper class for aggregating and computing mutual Chi2 statistics
+ */
+ private static class Chi2Scorer<OUTCOME_T> implements Function<String, Double> {
+ protected Multiset<OUTCOME_T> classCounts;
+
+ protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+ private boolean yates = false;
+
+ public Chi2Scorer(boolean yate) {
+ this.classCounts = HashMultiset.<OUTCOME_T> create();
+ this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+ this.yates = yate;
+ }
+
+ public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+ Integer count = this.featValueClassCount.get(featureName, outcome);
+ if (count == null) {
+ count = 0;
+ }
+ this.featValueClassCount.put(featureName, outcome, count + occurrences);
+ this.classCounts.add(outcome, occurrences);
+ }
+
+ public Double apply(String featureName) {
+ return this.score(featureName);
+ }
+
+ public double score(String featureName) {
+ // notation index of 0 means false, 1 mean true
+ // Contingency Table:
+ // | class1 | class2 | class3 | sum
+ // posi | | | | posiFeatCount
+ // nega | | | | negaFeatCount
+ // | outcnt1 | outcnt2 | outcnt3 | n
+
+ int numOfClass = this.classCounts.elementSet().size();
+ int[] posiOutcomeCounts = new int[numOfClass];
+ int[] outcomeCounts = new int[numOfClass];
+ int classId = 0;
+ int posiFeatCount = 0;
+ for (OUTCOME_T clas : this.classCounts.elementSet()) {
+ posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)
+ ? this.featValueClassCount.get(featureName, clas)
+ : 0;
+ posiFeatCount += posiOutcomeCounts[classId];
+ outcomeCounts[classId] = this.classCounts.count(clas);
+ classId++;
+ }
+
+ int n = this.classCounts.size();
+ int negaFeatCount = n - posiFeatCount;
+
+ double chi2val = 0.0;
+
+ if (posiFeatCount == 0 || posiFeatCount == n) { // all instances have same value on this
+ // feature, degree of freedom = 0
+ return chi2val;
+ }
+
+ for (int lbl = 0; lbl < numOfClass; lbl++) {
+ // for positive part of feature:
+ double expected = (outcomeCounts[lbl] / (double) n) * (posiFeatCount);
+ if (expected > 0) {
+ double diff = Math.abs(posiOutcomeCounts[lbl] - expected);
+ if (this.yates ) { // apply Yate's correction
+ diff -= 0.5;
+ }
+ if (diff > 0)
+ chi2val += Math.pow(diff, 2) / expected;
+ }
+
+ // for negative part of feature:
+ expected = (outcomeCounts[lbl] / (double) n) * (negaFeatCount);
+ double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
+ if (expected > 0) {
+ double diff = Math.abs(observ - expected);
+ if (this.yates) { // apply Yate's correction
+ diff -= 0.5;
+ }
+ if (diff > 0)
+ chi2val += Math.pow(diff, 2) / expected;
+ }
+ }
+
+ return chi2val;
+ }
+
+ }
+
+ /**
+ * the percentage of total features that would be returned. range from 0% - 100%, i.e. [0,1]
+ */
+ private double chi2Threshold;
- protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+ private int numFeatures = 0;
+
+ private Chi2Scorer<OUTCOME_T> chi2Function;
private boolean yates = false;
- public Chi2Scorer(boolean yate) {
- this.classCounts = HashMultiset.<OUTCOME_T> create();
- this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
- this.yates = yate;
- }
-
- public void update(String featureName, OUTCOME_T outcome, int occurrences) {
- Integer count = this.featValueClassCount.get(featureName, outcome);
- if (count == null) {
- count = 0;
- }
- this.featValueClassCount.put(featureName, outcome, count + occurrences);
- this.classCounts.add(outcome, occurrences);
- }
-
- public Double apply(String featureName) {
- return this.score(featureName);
- }
-
- public double score(String featureName) {
- // notation index of 0 means false, 1 mean true
- // Contingency Table:
- // | class1 | class2 | class3 | sum
- // posi | | | | posiFeatCount
- // nega | | | | negaFeatCount
- // | outcnt1 | outcnt2 | outcnt3 | n
-
- int numOfClass = this.classCounts.elementSet().size();
- int[] posiOutcomeCounts = new int[numOfClass];
- int[] outcomeCounts = new int[numOfClass];
- int classId = 0;
- int posiFeatCount = 0;
- for (OUTCOME_T clas : this.classCounts.elementSet()) {
- posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)
- ? this.featValueClassCount.get(featureName, clas)
- : 0;
- posiFeatCount += posiOutcomeCounts[classId];
- outcomeCounts[classId] = this.classCounts.count(clas);
- classId++;
- }
-
- int n = this.classCounts.size();
- int negaFeatCount = n - posiFeatCount;
-
- double chi2val = 0.0;
-
- if (posiFeatCount == 0 || posiFeatCount == n) { // all instances have same value on this
- // feature, degree of freedom = 0
- return chi2val;
- }
-
- for (int lbl = 0; lbl < numOfClass; lbl++) {
- // for positive part of feature:
- double expected = (outcomeCounts[lbl] / (double) n) * (posiFeatCount);
- if (expected > 0) {
- double diff = Math.abs(posiOutcomeCounts[lbl] - expected);
- if (this.yates ) { // apply Yate's correction
- diff -= 0.5;
- }
- if (diff > 0)
- chi2val += Math.pow(diff, 2) / expected;
- }
-
- // for negative part of feature:
- expected = (outcomeCounts[lbl] / (double) n) * (negaFeatCount);
- double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
- if (expected > 0) {
- double diff = Math.abs(observ - expected);
- if (this.yates) { // apply Yate's correction
- diff -= 0.5;
- }
- if (diff > 0)
- chi2val += Math.pow(diff, 2) / expected;
- }
- }
-
- return chi2val;
- }
- }
-
- private double chi2Threshold;
-
- private Chi2Scorer<OUTCOME_T> chi2Function;
-
- private boolean yates = false;
-
- public Chi2FeatureSelection(String name) {
- this(name, 0.0);
- }
-
- public Chi2FeatureSelection(String name, double threshold) {
- super(name);
- this.chi2Threshold = threshold;
- }
-
- /**
- * Constructor that can let use control the yate's correction
- * @param name
- * @param threshold
- * @param yates : true for using yate's correction, false for turn off yate's correction
- */
- public Chi2FeatureSelection(String name, double threshold, boolean yates) {
- super(name);
- this.chi2Threshold = threshold;
- this.yates = yates;
- }
- @Override
- public boolean apply(Feature feature) {
- return this.selectedFeatureNames.contains(this.getFeatureName(feature));
- }
-
- @Override
- public void train(Iterable<Instance<OUTCOME_T>> instances) {
- // aggregate statistics for all features
- this.chi2Function = new Chi2Scorer<OUTCOME_T>(this.yates);
- for (Instance<OUTCOME_T> instance : instances) {
- OUTCOME_T outcome = instance.getOutcome();
- for (Feature feature : instance.getFeatures()) {
- if (this.isTransformable(feature)) {
- for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
- this.chi2Function.update(this.getFeatureName(untransformedFeature), outcome, 1);
- }
- }
- }
- }
- // keep only large chi2 valued features
- this.selectedFeatureNames = Sets.newHashSet();
- for (String featureName : this.chi2Function.featValueClassCount.rowKeySet()) {
- if (this.chi2Function.score(featureName) > this.chi2Threshold) {
- this.selectedFeatureNames.add(featureName);
- }
- }
-
- this.isTrained = true;
- }
-
- @Override
- public void save(URI uri) throws IOException {
- if (!this.isTrained) {
- throw new IllegalStateException("Cannot save before training");
- }
- File out = new File(uri);
- BufferedWriter writer = new BufferedWriter(new FileWriter(out));
-
- Ordering<String> ordering = Ordering.natural().onResultOf(this.chi2Function).reverse();
- for (String feature : ordering.immutableSortedCopy(this.selectedFeatureNames)) {
- writer.append(String.format("%s\t%f\n", feature, this.chi2Function.score(feature)));
- }
-
- writer.close();
- }
-
- @Override
- public void load(URI uri) throws IOException {
- this.selectedFeatureNames = Sets.newLinkedHashSet();
- File in = new File(uri);
- BufferedReader reader = new BufferedReader(new FileReader(in));
-
- // The lines are <feature-name>\t<feature-score>
- String line = null;
- while ((line = reader.readLine()) != null) {
- String[] featureValuePair = line.split("\t");
- this.selectedFeatureNames.add(featureValuePair[0]);
- }
+ public Chi2FeatureSelection(String name) {
+ this(name, 0.0);
+ }
+
+ public Chi2FeatureSelection(String name, double threshold) {
+ super(name);
+ this.chi2Threshold = threshold;
+ }
+
+ /**
+ * Constructor that can let use control the yate's correction
+ * @param name
+ * @param threshold
+ * @param yates : true for using yate's correction, false for turn off yate's correction
+ */
+ public Chi2FeatureSelection(String name, double threshold, boolean yates) {
+ super(name);
+ this.chi2Threshold = threshold;
+ this.yates = yates;
+ }
+ @Override
+ public boolean apply(Feature feature) {
+ return this.selectedFeatureNames.contains(this.getFeatureName(feature));
+ }
+
+ @Override
+ public void train(Iterable<Instance<OUTCOME_T>> instances) {
+ //check if chi2Threshold is bigger whithin range:
+ if(this.chi2Threshold<0 || this.chi2Threshold>1){
+ System.err.println("Feature Selection threshold should be from 0 to 1");
+ System.exit(0);
+ }
+
+ // aggregate statistics for all features
+ this.chi2Function = new Chi2Scorer<OUTCOME_T>(this.yates);
+ for (Instance<OUTCOME_T> instance : instances) {
+ OUTCOME_T outcome = instance.getOutcome();
+ for (Feature feature : instance.getFeatures()) {
+ if (this.isTransformable(feature)) {
+ for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+ this.chi2Function.update(this.getFeatureName(untransformedFeature), outcome, 1);
+ }
+ }
+ }
+ }
+
+
+ // // keep only large chi2 valued features
+ // this.selectedFeatureNames = Sets.newHashSet();
+ // for (String featureName : this.chi2Function.featValueClassCount.rowKeySet()) {
+ // if (this.chi2Function.score(featureName) > this.chi2Threshold) {
+ // this.selectedFeatureNames.add(featureName);
+ // }
+ // }
+
+ // sort features by Chi2 information score
+ Set<String> featureNames = this.chi2Function.featValueClassCount.rowKeySet();
+ Ordering<String> ordering = Ordering.natural().onResultOf(this.chi2Function).reverse();
+
+ int totalFeatures = featureNames.size();
+ this.numFeatures = (int) Math.round(totalFeatures*this.chi2Threshold);
+
+ // keep only the top N features
+ this.selectedFeatureNames = Sets.newLinkedHashSet(ordering.immutableSortedCopy(featureNames).subList(
+ 0,
+ this.numFeatures));
+
+ this.isTrained = true;
+ }
+
+ @Override
+ public void save(URI uri) throws IOException {
+ if (!this.isTrained) {
+ throw new IllegalStateException("Cannot save before training");
+ }
+ File out = new File(uri);
+ BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+
+ for (String feature : this.selectedFeatureNames) {
+ writer.append(String.format("%s\t%f\n", feature, this.chi2Function.score(feature)));
+ }
+
+ writer.close();
+ }
+
+ @Override
+ public void load(URI uri) throws IOException {
+ this.selectedFeatureNames = Sets.newLinkedHashSet();
+ File in = new File(uri);
+ BufferedReader reader = new BufferedReader(new FileReader(in));
+
+ // The lines are <feature-name>\t<feature-score>
+ String line = null;
+ int n = 0;
+ while ((line = reader.readLine()) != null && n < this.numFeatures) {
+ String[] featureValuePair = line.split("\t");
+ this.selectedFeatureNames.add(featureValuePair[0]);
+ n++;
+ }
- reader.close();
- this.isTrained = true;
+ reader.close();
+ this.isTrained = true;
- }
+ }
}
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1506989&r1=1506988&r2=1506989&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Thu Jul 25 14:32:02 2013
@@ -48,10 +48,10 @@ public class EvaluationOfEventSpans exte
@Option(longName = "downratio", defaultValue = "1")
public float getProbabilityOfKeepingANegativeExample();
- @Option(longName = "featureSelectionThreshold", defaultValue = "0")
+ @Option(longName = "featureSelectionThreshold", defaultValue = "1")
public float getFeatureSelectionThreshold();
- @Option(longName = "SMOTENeighborNumber", defaultValue = "1")
+ @Option(longName = "SMOTENeighborNumber", defaultValue = "0")
public float getSMOTENeighborNumber();
}