You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2013/07/25 16:32:03 UTC

svn commit: r1506989 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/EventAnnotator.java ae/feature/selection/Chi2FeatureSelection.java eval/EvaluationOfEventSpans.java

Author: clin
Date: Thu Jul 25 14:32:02 2013
New Revision: 1506989

URL: http://svn.apache.org/r1506989
Log:
add a token count check for event annotator
change the feature selection threshold to be a percentage X%, i.e. to select the top X% of all features
fix the SMOTE and feature selection default values

Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1506989&r1=1506988&r2=1506989&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Thu Jul 25 14:32:02 2013
@@ -85,7 +85,7 @@ public class EventAnnotator extends Temp
       name = PARAM_FEATURE_SELECTION_THRESHOLD,
       mandatory = false,
       description = "the Chi-squared threshold at which features should be removed")
-  protected Float featureSelectionThreshold = 0f;
+  protected Float featureSelectionThreshold = 1f; //default is not using feature selection, i.e. select 100% of all features.
   
   public static final String PARAM_SMOTE_NUM_NEIGHBORS = "NumOfNeighborForSMOTE";
   
@@ -151,7 +151,7 @@ public class EventAnnotator extends Temp
   private static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
 
   public static FeatureSelection<String> createFeatureSelection(double threshold) {
-    return new Chi2FeatureSelection<String>(EventAnnotator.FEATURE_SELECTION_NAME, threshold);
+    return new Chi2FeatureSelection<String>(EventAnnotator.FEATURE_SELECTION_NAME, threshold, false);
   }
   
   public static URI createFeatureSelectionURI(File outputDirectoryName) {
@@ -185,7 +185,7 @@ public class EventAnnotator extends Temp
         new Preceding(3),
         new Following(3));
 
-    if (featureSelectionThreshold == 0) {
+    if (featureSelectionThreshold == 1) {
       this.featureSelection = null;
     } else {
       this.featureSelection = EventAnnotator.createFeatureSelection(this.featureSelectionThreshold);
@@ -209,9 +209,11 @@ public class EventAnnotator extends Temp
     for (IdentifiedAnnotation entity : JCasUtil.select(jCas, IdentifiedAnnotation.class)) {
       if (!entity.getClass().equals(EventMention.class)) {
         List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, entity);
-        BaseToken lastToken = tokens.get(tokens.size() - 1);
-        String value = String.format("%s_%s", entity.getClass().getSimpleName(), entity.getTypeID());
-        endOfEntityFeatures.put(lastToken, new Feature("EndOf", value));
+        if (tokens.size() > 0){
+        	BaseToken lastToken = tokens.get(tokens.size() - 1);
+            String value = String.format("%s_%s", entity.getClass().getSimpleName(), entity.getTypeID());
+            endOfEntityFeatures.put(lastToken, new Feature("EndOf", value));
+        }       
       }
     }
 

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java?rev=1506989&r1=1506988&r2=1506989&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java Thu Jul 25 14:32:02 2013
@@ -7,6 +7,7 @@ import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.net.URI;
+import java.util.Set;
 
 import org.cleartk.classifier.Feature;
 import org.cleartk.classifier.Instance;
@@ -30,183 +31,210 @@ import com.google.common.collect.Table;
  */
 public class Chi2FeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {
 
-  /**
-   * Helper class for aggregating and computing mutual Chi2 statistics
-   */
-  private static class Chi2Scorer<OUTCOME_T> implements Function<String, Double> {
-    protected Multiset<OUTCOME_T> classCounts;
+	/**
+	 * Helper class for aggregating and computing mutual Chi2 statistics
+	 */
+	private static class Chi2Scorer<OUTCOME_T> implements Function<String, Double> {
+		protected Multiset<OUTCOME_T> classCounts;
+
+		protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+		private boolean yates = false;
+
+		public Chi2Scorer(boolean yate) {
+			this.classCounts = HashMultiset.<OUTCOME_T> create();
+			this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+			this.yates = yate;
+		}
+
+		public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+			Integer count = this.featValueClassCount.get(featureName, outcome);
+			if (count == null) {
+				count = 0;
+			}
+			this.featValueClassCount.put(featureName, outcome, count + occurrences);
+			this.classCounts.add(outcome, occurrences);
+		}
+
+		public Double apply(String featureName) {
+			return this.score(featureName);
+		}
+
+		public double score(String featureName) {
+			// notation index of 0 means false, 1 mean true
+			// Contingency Table:
+			//      | class1  | class2  | class3  | sum
+			// posi |         |         |         | posiFeatCount
+			// nega |         |         |         | negaFeatCount
+			//      | outcnt1 | outcnt2 | outcnt3 | n
+
+			int numOfClass = this.classCounts.elementSet().size();
+			int[] posiOutcomeCounts = new int[numOfClass];
+			int[] outcomeCounts = new int[numOfClass];
+			int classId = 0;
+			int posiFeatCount = 0;
+			for (OUTCOME_T clas : this.classCounts.elementSet()) {
+				posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)
+						? this.featValueClassCount.get(featureName, clas)
+								: 0;
+						posiFeatCount += posiOutcomeCounts[classId];
+						outcomeCounts[classId] = this.classCounts.count(clas);
+						classId++;
+			}
+
+			int n = this.classCounts.size();
+			int negaFeatCount = n - posiFeatCount;
+
+			double chi2val = 0.0;
+
+			if (posiFeatCount == 0 || posiFeatCount == n) { // all instances have same value on this
+				// feature, degree of freedom = 0
+				return chi2val;
+			}
+
+			for (int lbl = 0; lbl < numOfClass; lbl++) {
+				// for positive part of feature:
+				double expected = (outcomeCounts[lbl] / (double) n) * (posiFeatCount);
+				if (expected > 0) {
+					double diff = Math.abs(posiOutcomeCounts[lbl] - expected);
+					if (this.yates ) { // apply Yate's correction
+						diff -= 0.5;
+					}
+					if (diff > 0)
+						chi2val += Math.pow(diff, 2) / expected;
+				}
+
+				// for negative part of feature:
+				expected = (outcomeCounts[lbl] / (double) n) * (negaFeatCount);
+				double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
+				if (expected > 0) {
+					double diff = Math.abs(observ - expected);
+					if (this.yates) { // apply Yate's correction
+						diff -= 0.5;
+					}
+					if (diff > 0)
+						chi2val += Math.pow(diff, 2) / expected;
+				}
+			}
+
+			return chi2val;
+		}
+
+	}
+
+	/**
+	 * the percentage of total features that would be returned. range from 0% - 100%, i.e. [0,1]
+	 */
+	private double chi2Threshold;
 
-    protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+	private int numFeatures = 0;
+
+	private Chi2Scorer<OUTCOME_T> chi2Function;
 
 	private boolean yates = false;
 
-    public Chi2Scorer(boolean yate) {
-      this.classCounts = HashMultiset.<OUTCOME_T> create();
-      this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
-      this.yates = yate;
-    }
-
-    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
-      Integer count = this.featValueClassCount.get(featureName, outcome);
-      if (count == null) {
-        count = 0;
-      }
-      this.featValueClassCount.put(featureName, outcome, count + occurrences);
-      this.classCounts.add(outcome, occurrences);
-    }
-    
-    public Double apply(String featureName) {
-      return this.score(featureName);
-    }
-
-    public double score(String featureName) {
-      // notation index of 0 means false, 1 mean true
-      // Contingency Table:
-      //      | class1  | class2  | class3  | sum
-      // posi |         |         |         | posiFeatCount
-      // nega |         |         |         | negaFeatCount
-      //      | outcnt1 | outcnt2 | outcnt3 | n
-
-      int numOfClass = this.classCounts.elementSet().size();
-      int[] posiOutcomeCounts = new int[numOfClass];
-      int[] outcomeCounts = new int[numOfClass];
-      int classId = 0;
-      int posiFeatCount = 0;
-      for (OUTCOME_T clas : this.classCounts.elementSet()) {
-        posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)
-            ? this.featValueClassCount.get(featureName, clas)
-            : 0;
-        posiFeatCount += posiOutcomeCounts[classId];
-        outcomeCounts[classId] = this.classCounts.count(clas);
-        classId++;
-      }
-
-      int n = this.classCounts.size();
-      int negaFeatCount = n - posiFeatCount;
-
-      double chi2val = 0.0;
-
-      if (posiFeatCount == 0 || posiFeatCount == n) { // all instances have same value on this
-                                                      // feature, degree of freedom = 0
-        return chi2val;
-      }
-
-      for (int lbl = 0; lbl < numOfClass; lbl++) {
-        // for positive part of feature:
-        double expected = (outcomeCounts[lbl] / (double) n) * (posiFeatCount);
-        if (expected > 0) {
-          double diff = Math.abs(posiOutcomeCounts[lbl] - expected);
-          if (this.yates ) { // apply Yate's correction
-            diff -= 0.5;
-          }
-          if (diff > 0)
-            chi2val += Math.pow(diff, 2) / expected;
-        }
-
-        // for negative part of feature:
-        expected = (outcomeCounts[lbl] / (double) n) * (negaFeatCount);
-        double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
-        if (expected > 0) {
-          double diff = Math.abs(observ - expected);
-          if (this.yates) { // apply Yate's correction
-            diff -= 0.5;
-          }
-          if (diff > 0)
-            chi2val += Math.pow(diff, 2) / expected;
-        }
-      }
-
-      return chi2val;
-    }
-  }
-
-  private double chi2Threshold;
-
-  private Chi2Scorer<OUTCOME_T> chi2Function;
-  
-  private boolean yates = false;
-
-  public Chi2FeatureSelection(String name) {
-    this(name, 0.0);
-  }
-
-  public Chi2FeatureSelection(String name, double threshold) {
-    super(name);
-    this.chi2Threshold = threshold;
-  }
-
-  /**
-   * Constructor that can let use control the yate's correction
-   * @param name
-   * @param threshold
-   * @param yates : true for using yate's correction, false for turn off yate's correction
-   */
-  public Chi2FeatureSelection(String name, double threshold, boolean yates) {
-	    super(name);
-	    this.chi2Threshold = threshold;
-	    this.yates = yates;
-	  }
-  @Override
-  public boolean apply(Feature feature) {
-    return this.selectedFeatureNames.contains(this.getFeatureName(feature));
-  }
-
-  @Override
-  public void train(Iterable<Instance<OUTCOME_T>> instances) {
-    // aggregate statistics for all features
-    this.chi2Function = new Chi2Scorer<OUTCOME_T>(this.yates);
-    for (Instance<OUTCOME_T> instance : instances) {
-      OUTCOME_T outcome = instance.getOutcome();
-      for (Feature feature : instance.getFeatures()) {
-        if (this.isTransformable(feature)) {
-          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
-            this.chi2Function.update(this.getFeatureName(untransformedFeature), outcome, 1);
-          }
-        }
-      }
-    }
-    // keep only large chi2 valued features
-    this.selectedFeatureNames = Sets.newHashSet();
-    for (String featureName : this.chi2Function.featValueClassCount.rowKeySet()) {
-      if (this.chi2Function.score(featureName) > this.chi2Threshold) {
-        this.selectedFeatureNames.add(featureName);
-      }
-    }
-
-    this.isTrained = true;
-  }
-
-  @Override
-  public void save(URI uri) throws IOException {
-    if (!this.isTrained) {
-      throw new IllegalStateException("Cannot save before training");
-    }
-    File out = new File(uri);
-    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
-
-    Ordering<String> ordering = Ordering.natural().onResultOf(this.chi2Function).reverse();
-    for (String feature : ordering.immutableSortedCopy(this.selectedFeatureNames)) {
-      writer.append(String.format("%s\t%f\n", feature, this.chi2Function.score(feature)));
-    }
-
-    writer.close();
-  }
-
-  @Override
-  public void load(URI uri) throws IOException {
-    this.selectedFeatureNames = Sets.newLinkedHashSet();
-    File in = new File(uri);
-    BufferedReader reader = new BufferedReader(new FileReader(in));
-
-    // The lines are <feature-name>\t<feature-score>
-    String line = null;
-    while ((line = reader.readLine()) != null) {
-      String[] featureValuePair = line.split("\t");
-      this.selectedFeatureNames.add(featureValuePair[0]);
-    }
+	public Chi2FeatureSelection(String name) {
+		this(name, 0.0);
+	}
+
+	public Chi2FeatureSelection(String name, double threshold) {
+		super(name);
+		this.chi2Threshold = threshold;
+	}
+
+	/**
+	 * Constructor that can let use control the yate's correction
+	 * @param name
+	 * @param threshold
+	 * @param yates : true for using yate's correction, false for turn off yate's correction
+	 */
+	public Chi2FeatureSelection(String name, double threshold, boolean yates) {
+		super(name);
+		this.chi2Threshold = threshold;
+		this.yates = yates;
+	}
+	@Override
+	public boolean apply(Feature feature) {
+		return this.selectedFeatureNames.contains(this.getFeatureName(feature));
+	}
+
+	@Override
+	public void train(Iterable<Instance<OUTCOME_T>> instances) {
+		// check that chi2Threshold is within range:
+		if(this.chi2Threshold<0 || this.chi2Threshold>1){
+			System.err.println("Feature Selection threshold should be from 0 to 1");
+			System.exit(0);
+		}
+
+		// aggregate statistics for all features
+		this.chi2Function = new Chi2Scorer<OUTCOME_T>(this.yates);
+		for (Instance<OUTCOME_T> instance : instances) {
+			OUTCOME_T outcome = instance.getOutcome();
+			for (Feature feature : instance.getFeatures()) {
+				if (this.isTransformable(feature)) {
+					for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+						this.chi2Function.update(this.getFeatureName(untransformedFeature), outcome, 1);
+					}
+				}
+			}
+		}
+
+
+		//    // keep only large chi2 valued features
+		//    this.selectedFeatureNames = Sets.newHashSet();
+		//    for (String featureName : this.chi2Function.featValueClassCount.rowKeySet()) {
+		//      if (this.chi2Function.score(featureName) > this.chi2Threshold) {
+		//        this.selectedFeatureNames.add(featureName);
+		//      }
+		//    }
+
+		// sort features by Chi2 information score
+		Set<String> featureNames = this.chi2Function.featValueClassCount.rowKeySet();
+		Ordering<String> ordering = Ordering.natural().onResultOf(this.chi2Function).reverse();
+
+		int totalFeatures = featureNames.size();
+		this.numFeatures = (int) Math.round(totalFeatures*this.chi2Threshold);
+
+		// keep only the top N features
+		this.selectedFeatureNames = Sets.newLinkedHashSet(ordering.immutableSortedCopy(featureNames).subList(
+				0,
+				this.numFeatures));
+
+		this.isTrained = true;
+	}
+
+	@Override
+	public void save(URI uri) throws IOException {
+		if (!this.isTrained) {
+			throw new IllegalStateException("Cannot save before training");
+		}
+		File out = new File(uri);
+		BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+
+		for (String feature : this.selectedFeatureNames) {
+			writer.append(String.format("%s\t%f\n", feature, this.chi2Function.score(feature)));
+		}
+
+		writer.close();
+	}
+
+	@Override
+	public void load(URI uri) throws IOException {
+		this.selectedFeatureNames = Sets.newLinkedHashSet();
+		File in = new File(uri);
+		BufferedReader reader = new BufferedReader(new FileReader(in));
+
+		// The lines are <feature-name>\t<feature-score>
+		String line = null;
+		int n = 0;
+		while ((line = reader.readLine()) != null && n < this.numFeatures) {
+			String[] featureValuePair = line.split("\t");
+			this.selectedFeatureNames.add(featureValuePair[0]);
+			n++;
+		}
 
-    reader.close();
-    this.isTrained = true;
+		reader.close();
+		this.isTrained = true;
 
-  }
+	}
 }

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1506989&r1=1506988&r2=1506989&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Thu Jul 25 14:32:02 2013
@@ -48,10 +48,10 @@ public class EvaluationOfEventSpans exte
     @Option(longName = "downratio", defaultValue = "1")
     public float getProbabilityOfKeepingANegativeExample();
 
-    @Option(longName = "featureSelectionThreshold", defaultValue = "0")
+    @Option(longName = "featureSelectionThreshold", defaultValue = "1")
     public float getFeatureSelectionThreshold();
 
-    @Option(longName = "SMOTENeighborNumber", defaultValue = "1")
+    @Option(longName = "SMOTENeighborNumber", defaultValue = "0")
     public float getSMOTENeighborNumber();
   }