You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2012/11/27 17:40:11 UTC
svn commit: r1414254 - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature: ./ selection/

Author: clin
Date: Tue Nov 27 16:40:09 2012
New Revision: 1414254

URL: http://svn.apache.org/viewvc?rev=1414254&view=rev
Log:
updated feature extractors

Added:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java   (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java   (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java   (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java   (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java   (with props)

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+
+/**
+ * Looks up the covered text of an annotation in a map of value vectors and emits one numeric
+ * feature per vector component. Texts that are not in the map fall back to the component-wise
+ * mean of all vectors in the map.
+ */
+public class CoveredTextToValuesExtractor implements SimpleFeatureExtractor {
+
+  private final String name;
+
+  private final Map<String, double[]> textDoublesMap;
+
+  private final double[] meanValues;
+
+  /**
+   * @param name
+   *          prefix used when naming the generated features
+   * @param textDoublesMap
+   *          lookup from covered text to its value vector; must be non-empty and all vectors
+   *          must have the same length
+   * @throws IllegalArgumentException
+   *           if the map is empty or its vectors differ in length
+   */
+  public CoveredTextToValuesExtractor(String name, Map<String, double[]> textDoublesMap) {
+    super();
+    if (textDoublesMap.isEmpty()) {
+      throw new IllegalArgumentException("textDoublesMap cannot be empty");
+    }
+    this.name = name;
+    this.textDoublesMap = textDoublesMap;
+
+    // The first vector fixes the dimensionality every other vector has to agree with.
+    int nValues = textDoublesMap.values().iterator().next().length;
+    double[] sums = new double[nValues];
+    for (Map.Entry<String, double[]> entry : textDoublesMap.entrySet()) {
+      double[] values = entry.getValue();
+      // A ragged map would either overflow the sums array or silently skew the mean.
+      if (values.length != nValues) {
+        throw new IllegalArgumentException(String.format(
+            "textDoublesMap vectors must all have length %d but \"%s\" has length %d",
+            nValues,
+            entry.getKey(),
+            values.length));
+      }
+      for (int i = 0; i < nValues; ++i) {
+        sums[i] += values[i];
+      }
+    }
+    int nMapEntries = textDoublesMap.size();
+    for (int i = 0; i < nValues; ++i) {
+      sums[i] /= nMapEntries;
+    }
+    this.meanValues = sums;
+  }
+
+  /**
+   * Creates one feature per component of the vector registered for the annotation's covered
+   * text, or of the mean vector when the text is unknown. Features are named from the configured
+   * prefix and the component index.
+   */
+  @Override
+  public List<Feature> extract(JCas view, Annotation annotation) throws CleartkExtractorException {
+    double[] values = this.textDoublesMap.get(annotation.getCoveredText());
+    if (values == null) {
+      values = this.meanValues;
+    }
+    List<Feature> features = new ArrayList<Feature>(values.length);
+    for (int i = 0; i < values.length; ++i) {
+      String featureName = Feature.createName(this.name, String.valueOf(i));
+      features.add(new Feature(featureName, values[i]));
+    }
+    return features;
+  }
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Emits a single "PhraseType" feature whose value is "NP" or "VP" when a chunk of that type is
+ * selected for the given annotation, and "NotNPVP" otherwise. The first NP/VP chunk found wins.
+ */
+public class PhraseExtractor implements SimpleFeatureExtractor {
+
+  @Override
+  public List<Feature> extract(JCas jCas, Annotation token) throws CleartkExtractorException {
+    String featureValue = "NotNPVP";
+    // NOTE(review): selectCovered only returns chunks lying inside the token's span; confirm
+    // that this, rather than chunks covering the token, is what is intended.
+    for (Chunk chunk : JCasUtil.selectCovered(jCas, Chunk.class, token)) {
+      String chunkType = chunk.getChunkType();
+      // Constant-first comparison so that a chunk without a type is skipped instead of
+      // aborting extraction with a NullPointerException.
+      if ("NP".equals(chunkType) || "VP".equals(chunkType)) {
+        featureValue = chunkType;
+        break;
+      }
+    }
+    return Collections.singletonList(new Feature("PhraseType", featureValue));
+  }
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.Predicate;
+import org.apache.ctakes.typesystem.type.textsem.SemanticArgument;
+import org.apache.ctakes.typesystem.type.textsem.SemanticRoleRelation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Emits a single "SemanticRole" feature for the focus annotation: "Predicate" when the focus is
+ * one of the tokens of a predicate, the argument label when it is one of the tokens of a
+ * semantic argument, and "NoRole" otherwise. Predicates are visited in the order returned by the
+ * CAS and the first match wins.
+ *
+ * No caching is performed; every call scans all predicates of the CAS. An earlier draft also
+ * built a combined role-and-verb feature, but it was never added to the result, so the work of
+ * computing it has been dropped.
+ */
+public class SRLExtractor implements SimpleFeatureExtractor {
+
+  private static final String ROLE_FEATURE_NAME = "SemanticRole";
+
+  @Override
+  public List<Feature> extract(JCas jCas, Annotation focusAnnotation)
+      throws CleartkExtractorException {
+    ArrayList<Feature> features = new ArrayList<Feature>();
+    features.add(new Feature(ROLE_FEATURE_NAME, findRole(jCas, focusAnnotation)));
+    return features;
+  }
+
+  /** Returns the role value of the focus annotation, or "NoRole" when none applies. */
+  private static String findRole(JCas jCas, Annotation focusAnnotation) {
+    for (Predicate predicate : JCasUtil.select(jCas, Predicate.class)) {
+      // Being part of the predicate itself takes precedence over being one of its arguments.
+      if (hasToken(jCas, predicate, focusAnnotation)) {
+        return "Predicate";
+      }
+      for (SemanticRoleRelation relation : JCasUtil.select(
+          predicate.getRelations(),
+          SemanticRoleRelation.class)) {
+        SemanticArgument arg = relation.getArgument();
+        if (hasToken(jCas, arg, focusAnnotation)) {
+          return arg.getLabel();
+        }
+      }
+    }
+    return "NoRole";
+  }
+
+  /** Tells whether the focus annotation is one of the base tokens covered by the given span. */
+  private static boolean hasToken(JCas jCas, Annotation span, Annotation focusAnnotation) {
+    for (BaseToken token : JCasUtil.selectCovered(jCas, BaseToken.class, span)) {
+      if (token.equals(focusAnnotation)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,68 @@
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+
+/**
+ * Describes the surface form of the focus annotation's text with two features: "Surface", a
+ * coarse orthographic category, and "Length", which is "single" for texts of at most one
+ * character and "multiple" otherwise.
+ */
+public class SurfaceFormFeatureExtractor implements SimpleFeatureExtractor {
+
+	// Values of the "Surface" feature. They are emitted at runtime, so they are kept
+	// byte-for-byte, including the spelling of "SingelCapital".
+	private static final String SYMBOL = "Symbol";
+	private static final String ALL_CAPITAL = "AllCapital";
+	private static final String FIRST_CAPITAL = "FirstCapital";
+	private static final String SINGLE_CAPITAL = "SingelCapital";
+	private static final String SINGLE_LETTER = "SingleLetter";
+	private static final String ALL_LOWER = "AllLower";
+	private static final String NUMBER = "Number";
+	private static final String WORDNUMMIX = "WordNumberMix";
+	private static final String NO_MATCH = "Nomatch";
+
+	// Patterns are compiled once; extract() is called per annotation and String.matches would
+	// recompile each expression on every call.
+	private static final java.util.regex.Pattern SYMBOL_REG = java.util.regex.Pattern.compile("\\W+");
+	private static final java.util.regex.Pattern ALL_CAPITAL_REG = java.util.regex.Pattern.compile("[A-Z][A-Z]+");
+	private static final java.util.regex.Pattern FIRST_CAPITAL_REG = java.util.regex.Pattern.compile("^[A-Z][a-z]+");
+	private static final java.util.regex.Pattern SINGLE_CAPITAL_REG = java.util.regex.Pattern.compile("^[A-Z]{1}$");
+	private static final java.util.regex.Pattern SINGLE_LETTER_REG = java.util.regex.Pattern.compile("^[a-z]{1}$");
+	private static final java.util.regex.Pattern ALL_LOWER_REG = java.util.regex.Pattern.compile("[a-z][a-z]+");
+	private static final java.util.regex.Pattern NUMBER_REG = java.util.regex.Pattern.compile("[\\d]*\\.?[\\d]+");
+	private static final java.util.regex.Pattern WORDNUMMIX_REG = java.util.regex.Pattern.compile("[\\w][\\w]+");
+
+	private static final String FEATURE_SURF = "Surface";
+	private static final String FEATURE_LENGTH = "Length";
+
+	/**
+	 * Extracts the "Surface" and "Length" features from the document text spanned by the focus
+	 * annotation. A view without document text is treated as if the span were empty.
+	 */
+	@Override
+	public List<Feature> extract(JCas view, Annotation focusAnnotation)
+			throws CleartkExtractorException {
+		List<Feature> features = new ArrayList<Feature>();
+		String jCasText = view.getDocumentText();
+		// Fall back to the empty string: a null text used to be handed on and dereferenced.
+		String text = jCasText == null
+				? ""
+				: jCasText.substring(focusAnnotation.getBegin(), focusAnnotation.getEnd());
+		features.add(new Feature(FEATURE_SURF, getStrType(text)));
+		features.add(new Feature(FEATURE_LENGTH, text.length() <= 1 ? "single" : "multiple"));
+		return features;
+	}
+
+	/** Small manual check that prints the category of a sample string. */
+	public static void main(String[] args) throws Exception {
+		String test = "a";
+		System.out.println("String type is :" + getStrType(test));
+	}
+
+	/**
+	 * Classifies a string by the first pattern that matches it entirely, testing the categories
+	 * in a fixed order; returns "Nomatch" when none applies.
+	 */
+	private static String getStrType(String test) {
+		if (matches(ALL_CAPITAL_REG, test)) {
+			return ALL_CAPITAL;
+		} else if (matches(ALL_LOWER_REG, test)) {
+			return ALL_LOWER;
+		} else if (matches(FIRST_CAPITAL_REG, test)) {
+			return FIRST_CAPITAL;
+		} else if (matches(NUMBER_REG, test)) {
+			return NUMBER;
+		} else if (matches(SINGLE_CAPITAL_REG, test)) {
+			return SINGLE_CAPITAL;
+		} else if (matches(SINGLE_LETTER_REG, test)) {
+			return SINGLE_LETTER;
+		} else if (matches(SYMBOL_REG, test)) {
+			return SYMBOL;
+		} else if (matches(WORDNUMMIX_REG, test)) {
+			return WORDNUMMIX;
+		}
+		return NO_MATCH;
+	}
+
+	/** Whole-string match, equivalent to String.matches with a precompiled pattern. */
+	private static boolean matches(java.util.regex.Pattern pattern, String text) {
+		return pattern.matcher(text).matches();
+	}
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,375 @@
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor.Chi2Evaluator.ComputeFeatureScore;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.BetweenAnnotationsFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Table;
+
+/**
+ * 
+ * Selects features via Chi-squared statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ * 
+ * @author Chen Lin
+ * 
+ */
+public class Chi2NeighborFSExtractor<OUTCOME_T> extends FeatureSelectionExtractor<OUTCOME_T>
+		implements SimpleFeatureExtractor , BetweenAnnotationsFeatureExtractor{
+	
+			/**
+			   * Bounds that accept every annotation, leaving the context unrestricted.
+			   */
+			  private static class NoBounds implements Bounds {
+
+			    public NoBounds() {
+			      super();
+			    }
+
+			    @Override
+			    public boolean contains(Annotation annotation) {
+			      // No span to compare against: everything is inside.
+			      final boolean unrestricted = true;
+			      return unrestricted;
+			    }
+
+			  }
+
+			/**
+			   * A Bounds implementation that restricts the context to annotations within a given span.
+			   */
+		private static class SpanBounds implements Bounds {
+
+		private int begin;
+
+		private int end;
+
+		public SpanBounds(int begin, int end) {
+		    this.begin = begin;
+		    this.end = end;
+		}
+
+		@Override
+		public boolean contains(Annotation annotation) {
+		    return annotation.getBegin() >= this.begin && annotation.getEnd() <= this.end;
+		}
+
+	}
+
+		/**
+		   * Helper class that aggregates feature/outcome co-occurrence counts and computes
+		   * Chi-squared statistics from them.
+		*/
+		public static class Chi2Evaluator<OUTCOME_T> {
+			/** Number of recorded occurrences per outcome. */
+			protected Multiset<OUTCOME_T> classCounts;
+
+			/** Number of recorded occurrences per (feature name, outcome) pair. */
+			protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+			public Chi2Evaluator() {
+				this.classCounts = HashMultiset.<OUTCOME_T> create();
+				this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+			}
+
+			/**
+			 * Records that the named feature was seen with the given outcome.
+			 *
+			 * @param featureName
+			 *          name of the observed feature
+			 * @param outcome
+			 *          outcome it was observed with
+			 * @param occurrences
+			 *          how many observations to add to both the pair count and the outcome count
+			 */
+			public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+				Integer count = this.featValueClassCount.get(featureName, outcome);
+				if (count == null) {
+					count = 0;
+				}
+				this.featValueClassCount.put(featureName, outcome, count + occurrences);
+				this.classCounts.add(outcome, occurrences);
+			}
+
+			/**
+			 * Computes the Chi-squared statistic of the contingency table "feature present /
+			 * feature absent" versus outcome, using the counts recorded so far.
+			 *
+			 * @param featureName
+			 *          name of the feature to score
+			 * @return the statistic, or 0.0 when the feature was seen with none or with all of the
+			 *         recorded observations
+			 */
+			public double Chi2Cal(String featureName) {
+				Set<OUTCOME_T> classes = this.classCounts.elementSet();
+				int numOfClass = classes.size();
+				int[] posiOutcomeCounts = new int[numOfClass];
+				int[] outcomeCounts = new int[numOfClass];
+				int classId = 0;
+				int posiFeatCount = 0;
+				for (OUTCOME_T clas : classes) {
+					Integer observed = this.featValueClassCount.get(featureName, clas);
+					posiOutcomeCounts[classId] = observed == null ? 0 : observed;
+					posiFeatCount += posiOutcomeCounts[classId];
+					outcomeCounts[classId] = this.classCounts.count(clas);
+					classId++;
+				}
+
+				int n = this.classCounts.size();
+				// all observations share the same value of this feature: degree of freedom = 0
+				if (posiFeatCount == 0 || posiFeatCount == n) {
+					return 0.0;
+				}
+				int negaFeatCount = n - posiFeatCount;
+
+				double chi2val = 0.0;
+				for (int lbl = 0; lbl < numOfClass; lbl++) {
+					// Widen to double BEFORE multiplying: the product of two counts can exceed the
+					// int range and would otherwise wrap around before the division.
+					// for positive part of feature:
+					double expected = (double) outcomeCounts[lbl] * posiFeatCount / n;
+					if (expected > 0) {
+						chi2val += Math.pow(posiOutcomeCounts[lbl] - expected, 2) / expected;
+					}
+					// for negative part of feature:
+					expected = (double) outcomeCounts[lbl] * negaFeatCount / n;
+					double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
+					if (expected > 0) {
+						chi2val += Math.pow(observ - expected, 2) / expected;
+					}
+				}
+
+				return chi2val;
+			}
+
+			/**
+			 * Writes a tab-separated report: a header, one line with name and Chi2 value per
+			 * feature, and finally the raw count table.
+			 *
+			 * @param outputURI
+			 *          location of the file to (over)write
+			 * @throws IOException
+			 *           if the file cannot be written
+			 */
+			public void save(URI outputURI) throws IOException {
+				BufferedWriter writer = new BufferedWriter(new FileWriter(new File(outputURI)));
+				// Close in finally so a failed write does not leak the file handle.
+				try {
+					// Write out header
+					writer.append("Chi2 FS Neighbor Data\n");
+					writer.append("Feature\t");
+					writer.append(Joiner.on("\t").join(this.featValueClassCount.columnKeySet()));
+					writer.append("\n");
+
+					// Write out Chi2 values for all features
+					for (String featureName : this.featValueClassCount.rowKeySet()) {
+						writer.append(featureName);
+						writer.append("\t");
+						writer.append(String.format("%f", this.Chi2Cal(featureName)));
+						writer.append("\n");
+					}
+					writer.append("\n");
+					writer.append(this.featValueClassCount.toString());
+				} finally {
+					writer.close();
+				}
+			}
+
+			/** Returns a function that maps a feature name to its current Chi2 value. */
+			public ComputeFeatureScore<OUTCOME_T> getScoreFunction() {
+				return new ComputeFeatureScore<OUTCOME_T>(this);
+			}
+
+			/** Function view of {@link Chi2Evaluator#Chi2Cal(String)}. */
+			public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
+
+				private final Chi2Evaluator<OUTCOME_T> stats;
+
+				public ComputeFeatureScore(Chi2Evaluator<OUTCOME_T> stats) {
+					this.stats = stats;
+				}
+
+				@Override
+				public Double apply(String featureName) {
+					return this.stats.Chi2Cal(featureName);
+				}
+
+			}
+		}
+			
+			
+	protected boolean isTrained;
+	private CombinedExtractor subExtractor;
+	private List<String> selectedFeatures;
+	private double chi2Threshold;
+	private Chi2Evaluator<OUTCOME_T> chi2Evaluator;
+	private Context[] contexts;
+	private Class<? extends Annotation> annotationClass;
+
+	/**
+	 * Creates an extractor with a Chi2 threshold of 0.0.
+	 *
+	 * @param name
+	 *          name of this extractor
+	 * @param annotationClass
+	 *          type of the context annotations handed to the contexts
+	 * @param featureExtractor
+	 *          sub-extractor whose features are selected
+	 * @param contexts
+	 *          contexts used by the bounded extraction methods
+	 */
+	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, Context... contexts) {
+		this(name, annotationClass, featureExtractor, 0.0, contexts);
+	}
+	
+	/**
+	 * Creates an extractor with an explicit Chi2 threshold.
+	 *
+	 * @param name
+	 *          name of this extractor
+	 * @param annotationClass
+	 *          type of the context annotations handed to the contexts
+	 * @param featureExtractor
+	 *          sub-extractor whose features are selected
+	 * @param thres
+	 *          Chi2 threshold; training drops features whose score does not exceed it
+	 * @param contexts
+	 *          contexts used by the bounded extraction methods
+	 */
+	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, double thres, Context... contexts) {
+		super(name);
+		this.contexts = contexts;
+		this.annotationClass = annotationClass;
+		this.init(featureExtractor, thres);
+	}
+
+	/** Stores the wrapped sub-extractor and the Chi2 selection threshold. */
+	private void init(CombinedExtractor featureExtractor, double thres) {
+		this.chi2Threshold = thres;
+		this.subExtractor = featureExtractor;
+	}
+
+	/**
+	 * Runs the sub-extractor on the focus annotation. Once trained, only the selected features
+	 * are returned; before that, all features are wrapped in one transformable container
+	 * feature so they can be processed later.
+	 */
+	@Override
+	public List<Feature> extract(JCas view, Annotation focusAnnotation)
+			throws CleartkExtractorException {
+		List<Feature> subFeatures = this.subExtractor.extract(view, focusAnnotation);
+		if (!this.isTrained) {
+			// Not trained yet: hand everything on inside a single container feature.
+			List<Feature> wrapped = new ArrayList<Feature>();
+			wrapped.add(new TransformableFeature(this.name, subFeatures));
+			return wrapped;
+		}
+		// Trained: keep only the features this extractor accepts.
+		return new ArrayList<Feature>(Collections2.filter(subFeatures, this));
+	}
+	
+	/**
+	 * Collects the sub-extractor's features from every configured context of the focus
+	 * annotation, restricted to the given bounds. Once trained, only the selected features are
+	 * returned; before that, all features are wrapped in one transformable container feature.
+	 */
+	public List<Feature> extract(JCas view, Annotation focusAnnotation, Bounds bounds)
+			throws CleartkExtractorException {
+		List<Feature> contextFeatures = new ArrayList<Feature>();
+		for (int i = 0; i < this.contexts.length; i++) {
+			List<Feature> fromContext = this.contexts[i].extract(
+					view,
+					focusAnnotation,
+					bounds,
+					this.annotationClass,
+					this.subExtractor);
+			contextFeatures.addAll(fromContext);
+		}
+		if (!this.isTrained) {
+			// Not trained yet: hand everything on inside a single container feature.
+			List<Feature> wrapped = new ArrayList<Feature>();
+			wrapped.add(new TransformableFeature(this.name, contextFeatures));
+			return wrapped;
+		}
+		// Trained: keep only the features this extractor accepts.
+		return new ArrayList<Feature>(Collections2.filter(contextFeatures, this));
+	}
+
+	/**
+	  * Extract features from the annotations around the focus annotation and within the given bounds.
+	   * 
+	   * @param view
+	   *          The JCas containing the focus annotation.
+	   * @param focusAnnotation
+	   *          The annotation whose context is to be searched.
+	   * @param boundsAnnotation
+	   *          The boundary within which context annotations may be identified.
+	   * @return The features extracted in the context of the focus annotation.
+	   */
+	public List<Feature> extractWithin(
+	      JCas view,
+	      Annotation focusAnnotation,
+	      Annotation boundsAnnotation) throws CleartkExtractorException {
+	    Bounds bounds = new SpanBounds(boundsAnnotation.getBegin(), boundsAnnotation.getEnd());
+	    return this.extract(view, focusAnnotation, bounds);
+	}
+	  
+	@Override
+	public boolean apply(Feature feature) {
+		return this.selectedFeatures.contains(this.nameFeature(feature));
+	}
+	
+	/**
+	 * Builds the key under which a feature is counted and selected: the bare name for numeric
+	 * values, otherwise the name and the value joined by a colon.
+	 */
+	public String nameFeature(Feature feature) {
+		if (feature.getValue() instanceof Number) {
+			return feature.getName();
+		}
+		return feature.getName() + ":" + feature.getValue();
+	}
+
+	/**
+	 * Counts feature/outcome co-occurrences over the given instances, discards every feature
+	 * whose Chi2 score does not exceed the threshold and keeps the rest ordered by descending
+	 * score. Only features inside this extractor's transformable container features are counted.
+	 */
+	@Override
+	public void train(Iterable<Instance<OUTCOME_T>> instances) {
+		// step 1: aggregate statistics for all features
+		this.chi2Evaluator = new Chi2Evaluator<OUTCOME_T>();
+		for (Instance<OUTCOME_T> instance : instances) {
+			OUTCOME_T outcome = instance.getOutcome();
+			for (Feature feature : instance.getFeatures()) {
+				if (!this.isTransformable(feature)) {
+					continue;
+				}
+				for (Feature inner : ((TransformableFeature) feature).getFeatures()) {
+					this.chi2Evaluator.update(this.nameFeature(inner), outcome, 1);
+				}
+			}
+		}
+
+		// step 2: drop features whose Chi2 score is not above the threshold; the key set is a
+		// view of the evaluator's table, so removal happens in the table itself
+		Set<String> candidates = this.chi2Evaluator.featValueClassCount.rowKeySet();
+		ComputeFeatureScore<OUTCOME_T> scorer = this.chi2Evaluator.getScoreFunction();
+		for (Iterator<String> it = candidates.iterator(); it.hasNext();) {
+			Double score = scorer.apply(it.next());
+			if (score <= this.chi2Threshold) {
+				it.remove();
+			}
+		}
+
+		// step 3: remember the survivors, best score first
+		this.selectedFeatures = Ordering.natural().onResultOf(
+				this.chi2Evaluator.getScoreFunction()).reverse().immutableSortedCopy(
+				candidates);
+
+		this.isTrained = true;
+	}
+
+	/**
+	 * Writes the selected features, one "name TAB score" line each, to the given location.
+	 *
+	 * @throws IOException
+	 *           if the extractor has not been trained, if no Chi2 statistics are available
+	 *           (the feature list was loaded rather than trained), or if writing fails
+	 */
+	@Override
+	public void save(URI uri) throws IOException {
+		if (!this.isTrained) {
+			throw new IOException("Chi2FSExtractor: Cannot save before training.");
+		}
+		// load() marks the extractor as trained without creating an evaluator. Check before
+		// opening the file so the target is not truncated by a call that cannot succeed.
+		if (this.chi2Evaluator == null) {
+			throw new IOException("Chi2FSExtractor: Cannot save without Chi2 statistics.");
+		}
+		ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
+
+		BufferedWriter writer = new BufferedWriter(new FileWriter(new File(uri)));
+		// Close in finally so a failed write does not leak the file handle.
+		try {
+			for (String feature : this.selectedFeatures) {
+				writer.append(String.format("%s\t%f\n", feature, computeScore.apply(feature)));
+			}
+		} finally {
+			writer.close();
+		}
+	}
+
+	/**
+	 * Reads the selected feature names from a file in which each line starts with a feature
+	 * name, optionally followed by a tab and further columns, and marks the extractor as
+	 * trained. The extractor's state is only replaced once the whole file has been read.
+	 *
+	 * @throws IOException
+	 *           if the file cannot be read
+	 */
+	@Override
+	public void load(URI uri) throws IOException {
+		List<String> loaded = Lists.newArrayList();
+		BufferedReader reader = new BufferedReader(new FileReader(new File(uri)));
+		// Close in finally so a failed read does not leak the file handle.
+		try {
+			String line;
+			while ((line = reader.readLine()) != null) {
+				// Only the first column (the feature name) is needed; the score is ignored.
+				String[] featureValuePair = line.split("\\t");
+				loaded.add(featureValuePair[0]);
+			}
+		} finally {
+			reader.close();
+		}
+		this.selectedFeatures = loaded;
+		this.isTrained = true;
+	}
+
+	/**
+	 * Extracts context features for the gap that starts where the first annotation ends and
+	 * ends where the second one begins, without restricting the context.
+	 */
+	@Override
+	public List<Feature> extractBetween(JCas jCas, Annotation annotation1,
+			Annotation annotation2) throws CleartkExtractorException {
+		// FIXME: creating a new annotation may leak memory - is there a better approach?
+		Annotation gap = new Annotation(jCas, annotation1.getEnd(), annotation2.getBegin());
+		Bounds anywhere = new NoBounds();
+		return this.extract(jCas, gap, anywhere);
+	}
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain