You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2012/11/27 17:40:11 UTC
svn commit: r1414254 - in
/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature:
./ selection/
Author: clin
Date: Tue Nov 27 16:40:09 2012
New Revision: 1414254
URL: http://svn.apache.org/viewvc?rev=1414254&view=rev
Log:
updated feature extractors
Added:
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java (with props)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java (with props)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java (with props)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java (with props)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java (with props)
Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+
+/**
+ * Feature extractor that looks up the covered text of an annotation in a map of numeric
+ * vectors and emits one numeric feature per vector position. Texts that are not present in
+ * the map fall back to the element-wise mean of all vectors supplied at construction time.
+ */
+public class CoveredTextToValuesExtractor implements SimpleFeatureExtractor {
+
+  private final String name;
+
+  private final Map<String, double[]> textDoublesMap;
+
+  private final double[] meanValues;
+
+  /**
+   * @param name
+   *          prefix used for the names of the generated features
+   * @param textDoublesMap
+   *          mapping from covered text to its value vector; must contain at least one entry.
+   *          The first vector returned by the map's iterator defines the length of the mean
+   *          vector.
+   * @throws IllegalArgumentException
+   *           if the map is empty, or if a vector is longer than the first one (previously
+   *           this surfaced as a bare ArrayIndexOutOfBoundsException)
+   */
+  public CoveredTextToValuesExtractor(String name, Map<String, double[]> textDoublesMap) {
+    super();
+    this.name = name;
+    this.textDoublesMap = textDoublesMap;
+    int nMapEntries = this.textDoublesMap.size();
+    if (nMapEntries == 0) {
+      throw new IllegalArgumentException("textDoublesMap cannot be empty");
+    }
+    int nValues = textDoublesMap.values().iterator().next().length;
+    this.meanValues = new double[nValues];
+    for (Map.Entry<String, double[]> entry : textDoublesMap.entrySet()) {
+      double[] values = entry.getValue();
+      if (values.length > nValues) {
+        throw new IllegalArgumentException(String.format(
+            "vector for \"%s\" has %d values but at most %d were expected",
+            entry.getKey(),
+            values.length,
+            nValues));
+      }
+      // Shorter vectors are still accepted: their missing positions contribute 0 to the sum.
+      for (int i = 0; i < values.length; ++i) {
+        this.meanValues[i] += values[i];
+      }
+    }
+    for (int i = 0; i < this.meanValues.length; ++i) {
+      this.meanValues[i] /= nMapEntries;
+    }
+  }
+
+  /**
+   * Creates one feature per position of the vector registered for the annotation's covered
+   * text, or of the mean vector when the text is unknown.
+   *
+   * @return features named from the extractor name and the vector index, each holding the
+   *         value at that index
+   */
+  @Override
+  public List<Feature> extract(JCas view, Annotation annotation) throws CleartkExtractorException {
+    double[] values = this.textDoublesMap.get(annotation.getCoveredText());
+    if (values == null) {
+      values = this.meanValues;
+    }
+    List<Feature> features = new ArrayList<Feature>(values.length);
+    for (int i = 0; i < values.length; ++i) {
+      String featureName = Feature.createName(this.name, String.valueOf(i));
+      features.add(new Feature(featureName, values[i]));
+    }
+    return features;
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Emits a single "PhraseType" feature whose value is "NP" or "VP" according to the first
+ * noun- or verb-phrase chunk found among the chunks selected for the token, and "NotNPVP"
+ * when there is none.
+ */
+public class PhraseExtractor implements SimpleFeatureExtractor {
+
+  private static final String FEATURE_NAME = "PhraseType";
+
+  private static final String NOUN_PHRASE = "NP";
+
+  private static final String VERB_PHRASE = "VP";
+
+  private static final String OTHER = "NotNPVP";
+
+  /**
+   * @param jCas
+   *          the CAS holding the chunk annotations
+   * @param token
+   *          the annotation whose phrase type is wanted
+   * @return a singleton list with the "PhraseType" feature
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, Annotation token) throws CleartkExtractorException {
+    String featureValue = OTHER;
+    // NOTE(review): selectCovered returns chunks lying inside the token span; if the intent
+    // is "the chunk that contains the token", selectCovering would be needed - confirm.
+    for (Chunk chunk : JCasUtil.selectCovered(jCas, Chunk.class, token)) {
+      String chunkType = chunk.getChunkType();
+      // Constant-first comparison: a chunk without a type no longer causes a
+      // NullPointerException, it is simply skipped.
+      if (NOUN_PHRASE.equals(chunkType)) {
+        featureValue = NOUN_PHRASE;
+        break;
+      }
+      if (VERB_PHRASE.equals(chunkType)) {
+        featureValue = VERB_PHRASE;
+        break;
+      }
+    }
+    return Collections.singletonList(new Feature(FEATURE_NAME, featureValue));
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.Predicate;
+import org.apache.ctakes.typesystem.type.textsem.SemanticArgument;
+import org.apache.ctakes.typesystem.type.textsem.SemanticRoleRelation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Emits a single "SemanticRole" feature for the focus annotation: "Predicate" when the focus
+ * is one of the tokens of a predicate, the argument label when it is one of the tokens of a
+ * semantic argument, and "NoRole" otherwise.
+ */
+public class SRLExtractor implements SimpleFeatureExtractor {
+
+  private static final String ROLE_FEATURE = "SemanticRole";
+
+  private static final String PREDICATE_ROLE = "Predicate";
+
+  private static final String NO_ROLE = "NoRole";
+
+  /**
+   * @param jCas
+   *          the CAS holding predicates, semantic role relations and base tokens
+   * @param focusAnnotation
+   *          the annotation to classify; it is compared to base tokens with equals()
+   * @return a new mutable list holding exactly one "SemanticRole" feature
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, Annotation focusAnnotation)
+      throws CleartkExtractorException {
+    List<Feature> features = new ArrayList<Feature>();
+    features.add(new Feature(ROLE_FEATURE, findRole(jCas, focusAnnotation)));
+    return features;
+  }
+
+  /**
+   * Scans predicates in index order and returns the first role that applies to the focus.
+   * For each predicate its own tokens are checked before the tokens of its arguments.
+   */
+  private static String findRole(JCas jCas, Annotation focusAnnotation) {
+    for (Predicate predicate : JCasUtil.select(jCas, Predicate.class)) {
+      if (coversToken(jCas, predicate, focusAnnotation)) {
+        return PREDICATE_ROLE;
+      }
+      for (SemanticRoleRelation relation : JCasUtil.select(
+          predicate.getRelations(),
+          SemanticRoleRelation.class)) {
+        SemanticArgument arg = relation.getArgument();
+        // A relation without an argument cannot cover the focus; skip it instead of failing.
+        if (arg != null && coversToken(jCas, arg, focusAnnotation)) {
+          return arg.getLabel();
+        }
+      }
+    }
+    return NO_ROLE;
+  }
+
+  /** Returns true when one of the base tokens covered by {@code span} equals the focus. */
+  private static boolean coversToken(JCas jCas, Annotation span, Annotation focusAnnotation) {
+    for (BaseToken token : JCasUtil.selectCovered(jCas, BaseToken.class, span)) {
+      if (token.equals(focusAnnotation)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,68 @@
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+
+/**
+ * Describes the surface form of the text covered by an annotation with two features: a
+ * character-class category ("Surface") and a coarse length indicator ("Length").
+ */
+public class SurfaceFormFeatureExtractor implements SimpleFeatureExtractor {
+
+  private static final String SYMBOL = "Symbol";
+  private static final String SYMBOL_REG = "\\W+";
+  private static final String ALL_CAPITAL = "AllCapital";
+  private static final String ALL_CAPITAL_REG = "[A-Z][A-Z]+";
+  private static final String FIRST_CAPITAL = "FirstCapital";
+  private static final String FIRST_CAPITAL_REG = "^[A-Z][a-z]+";
+  // NOTE(review): "SingelCapital" looks like a typo for "SingleCapital". It is kept as-is
+  // because it is emitted as a feature value, so changing it changes the generated features.
+  private static final String SINGLE_CAPITAL = "SingelCapital";
+  private static final String SINGLE_CAPITAL_REG = "^[A-Z]{1}$";
+  private static final String SINGLE_LETTER = "SingleLetter";
+  private static final String SINGLE_LETTER_REG = "^[a-z]{1}$";
+  private static final String ALL_LOWER = "AllLower";
+  private static final String ALL_LOWER_REG = "[a-z][a-z]+";
+  private static final String NUMBER = "Number";
+  private static final String NUMBER_REG = "[\\d]*\\.?[\\d]+";
+  private static final String WORDNUMMIX = "WordNumberMix";
+  private static final String WORDNUMMIX_REG = "[\\w][\\w]+";
+  private static final String NO_MATCH = "Nomatch";
+  private static final String FEATURE_SURF = "Surface";
+  private static final String FEATURE_LENGTH = "Length";
+
+  /**
+   * Builds the "Surface" and "Length" features for the text between the begin and end
+   * offsets of the focus annotation.
+   * <p>
+   * When the view has no document text the covered text is treated as empty, which yields
+   * "Nomatch" / "single" instead of the NullPointerException thrown before.
+   *
+   * @return a list with the "Surface" feature followed by the "Length" feature
+   */
+  @Override
+  public List<Feature> extract(JCas view, Annotation focusAnnotation)
+      throws CleartkExtractorException {
+    List<Feature> features = new ArrayList<Feature>();
+    String documentText = view.getDocumentText();
+    String text = documentText == null
+        ? ""
+        : documentText.substring(focusAnnotation.getBegin(), focusAnnotation.getEnd());
+    features.add(new Feature(FEATURE_SURF, getStrType(text)));
+    if (text.length() <= 1) {
+      features.add(new Feature(FEATURE_LENGTH, "single"));
+    } else {
+      features.add(new Feature(FEATURE_LENGTH, "multiple"));
+    }
+    return features;
+  }
+
+  /** Small manual check that prints the surface category of a sample string. */
+  public static void main(String[] args) throws Exception {
+    SurfaceFormFeatureExtractor se = new SurfaceFormFeatureExtractor();
+    String test = "a";
+    System.out.println("String type is :" + se.getStrType(test));
+  }
+
+  /**
+   * Returns the name of the first category whose pattern matches the whole string, or
+   * "Nomatch" when none does (including for a null string). The order of the checks is
+   * significant because several patterns overlap.
+   */
+  private String getStrType(String test) {
+    if (test == null) {
+      return NO_MATCH;
+    }
+    if (test.matches(ALL_CAPITAL_REG)) {
+      return ALL_CAPITAL;
+    }
+    if (test.matches(ALL_LOWER_REG)) {
+      return ALL_LOWER;
+    }
+    if (test.matches(FIRST_CAPITAL_REG)) {
+      return FIRST_CAPITAL;
+    }
+    if (test.matches(NUMBER_REG)) {
+      return NUMBER;
+    }
+    if (test.matches(SINGLE_CAPITAL_REG)) {
+      return SINGLE_CAPITAL;
+    }
+    if (test.matches(SINGLE_LETTER_REG)) {
+      return SINGLE_LETTER;
+    }
+    if (test.matches(SYMBOL_REG)) {
+      return SYMBOL;
+    }
+    if (test.matches(WORDNUMMIX_REG)) {
+      return WORDNUMMIX;
+    }
+    return NO_MATCH;
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java?rev=1414254&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java Tue Nov 27 16:40:09 2012
@@ -0,0 +1,375 @@
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor.Chi2Evaluator.ComputeFeatureScore;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.BetweenAnnotationsFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Table;
+
+/**
+ *
+ * Selects features via Chi-squared statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ *
+ * @author Chen Lin
+ *
+ */
+public class Chi2NeighborFSExtractor<OUTCOME_T> extends FeatureSelectionExtractor<OUTCOME_T>
+ implements SimpleFeatureExtractor , BetweenAnnotationsFeatureExtractor{
+
+ /**
+  * Bounds that accept every annotation, i.e. the context search is not restricted.
+  */
+ private static class NoBounds implements Bounds {
+
+   NoBounds() {
+     // no state to initialise
+   }
+
+   /** Always reports the annotation as being inside the bounds. */
+   @Override
+   public boolean contains(Annotation annotation) {
+     return true;
+   }
+
+ }
+
+ /**
+  * Bounds that only accept annotations lying completely inside a fixed character span.
+  */
+ private static class SpanBounds implements Bounds {
+
+   private final int spanBegin;
+
+   private final int spanEnd;
+
+   /**
+    * @param begin
+    *          first offset of the span (inclusive lower limit for an annotation's begin)
+    * @param end
+    *          last offset of the span (inclusive upper limit for an annotation's end)
+    */
+   public SpanBounds(int begin, int end) {
+     this.spanBegin = begin;
+     this.spanEnd = end;
+   }
+
+   /** Returns true when the annotation neither starts before nor ends after the span. */
+   @Override
+   public boolean contains(Annotation annotation) {
+     if (annotation.getBegin() < this.spanBegin) {
+       return false;
+     }
+     return annotation.getEnd() <= this.spanEnd;
+   }
+
+ }
+
+ /**
+  * Helper class that aggregates feature/outcome co-occurrence counts and computes a
+  * Chi-squared statistic per feature from them.
+  */
+ public static class Chi2Evaluator<OUTCOME_T> {
+   /** Total occurrences recorded per outcome (see the note on {@link #update}). */
+   protected Multiset<OUTCOME_T> classCounts;
+
+   /** Occurrences recorded per (feature name, outcome) pair. */
+   protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+   public Chi2Evaluator() {
+     this.classCounts = HashMultiset.<OUTCOME_T> create();
+     this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+   }
+
+   /**
+    * Records that a feature was seen together with an outcome.
+    * <p>
+    * NOTE(review): the outcome total is increased on every call, so the totals count
+    * feature occurrences rather than instances - confirm that this is intended.
+    *
+    * @param featureName
+    *          name of the observed feature
+    * @param outcome
+    *          outcome of the instance the feature was seen in
+    * @param occurrences
+    *          number of occurrences to add
+    */
+   public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+     Integer count = this.featValueClassCount.get(featureName, outcome);
+     if (count == null) {
+       count = 0;
+     }
+     this.featValueClassCount.put(featureName, outcome, count + occurrences);
+     this.classCounts.add(outcome, occurrences);
+   }
+
+   /**
+    * Computes the Chi-squared statistic of the contingency table "feature present / absent"
+    * versus outcome, using the counts recorded so far.
+    *
+    * @param featureName
+    *          name of the feature to score
+    * @return the statistic, or 0.0 when the feature was never seen or accounts for all
+    *         recorded occurrences
+    */
+   public double Chi2Cal(String featureName) {
+     int numOfClass = this.classCounts.elementSet().size();
+     int[] posiOutcomeCounts = new int[numOfClass];
+     int[] outcomeCounts = new int[numOfClass];
+     int classId = 0;
+     int posiFeatCount = 0;
+     for (OUTCOME_T clas : this.classCounts.elementSet()) {
+       Integer observed = this.featValueClassCount.get(featureName, clas);
+       posiOutcomeCounts[classId] = observed == null ? 0 : observed;
+       posiFeatCount += posiOutcomeCounts[classId];
+       outcomeCounts[classId] = this.classCounts.count(clas);
+       classId++;
+     }
+
+     // Multiset.size() is the total number of recorded occurrences over all outcomes.
+     int n = this.classCounts.size();
+     int negaFeatCount = n - posiFeatCount;
+
+     double chi2val = 0.0;
+
+     // Same value on this feature for everything recorded: zero degrees of freedom.
+     if (posiFeatCount == 0 || posiFeatCount == n) {
+       return chi2val;
+     }
+
+     for (int lbl = 0; lbl < numOfClass; lbl++) {
+       // Convert to double BEFORE multiplying: the product of two int counts can exceed
+       // Integer.MAX_VALUE on large data sets and would silently wrap around.
+       // Feature present:
+       double expected = (double) outcomeCounts[lbl] * posiFeatCount / n;
+       if (expected > 0) {
+         chi2val += Math.pow(posiOutcomeCounts[lbl] - expected, 2) / expected;
+       }
+       // Feature absent:
+       expected = (double) outcomeCounts[lbl] * negaFeatCount / n;
+       double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
+       if (expected > 0) {
+         chi2val += Math.pow(observ - expected, 2) / expected;
+       }
+     }
+
+     return chi2val;
+   }
+
+   /**
+    * Writes a tab-separated dump of the Chi-squared value of every feature, followed by the
+    * raw count table, to the given file.
+    *
+    * @param outputURI
+    *          location of the file to write
+    * @throws IOException
+    *           if the file cannot be written
+    */
+   public void save(URI outputURI) throws IOException {
+     File out = new File(outputURI);
+     BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+     try {
+       // Write out header
+       // NOTE(review): the header lists the outcome names, but each row below only holds
+       // the feature name and its Chi2 value - confirm which layout is wanted.
+       writer.append("Chi2 FS Neighbor Data\n");
+       writer.append("Feature\t");
+       writer.append(Joiner.on("\t").join(this.featValueClassCount.columnKeySet()));
+       writer.append("\n");
+
+       // Write out Chi2 values for all features
+       for (String featureName : this.featValueClassCount.rowKeySet()) {
+         writer.append(featureName);
+         writer.append("\t");
+         writer.append(String.format("%f", this.Chi2Cal(featureName)));
+         writer.append("\n");
+       }
+       writer.append("\n");
+       writer.append(this.featValueClassCount.toString());
+     } finally {
+       // Release the file handle even when one of the writes fails.
+       writer.close();
+     }
+   }
+
+   /** Returns a function that maps a feature name to its Chi-squared value. */
+   public ComputeFeatureScore<OUTCOME_T> getScoreFunction() {
+     return new ComputeFeatureScore<OUTCOME_T>(this);
+   }
+
+   /** Function adapter around {@link Chi2Evaluator#Chi2Cal(String)}. */
+   public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
+
+     private final Chi2Evaluator<OUTCOME_T> stats;
+
+     public ComputeFeatureScore(Chi2Evaluator<OUTCOME_T> stats) {
+       this.stats = stats;
+     }
+
+     @Override
+     public Double apply(String featureName) {
+       return this.stats.Chi2Cal(featureName);
+     }
+
+   }
+ }
+
+
+ protected boolean isTrained;
+ private CombinedExtractor subExtractor;
+ private List<String> selectedFeatures;
+ private double chi2Threshold;
+ private Chi2Evaluator<OUTCOME_T> chi2Evaluator;
+ private Context[] contexts;
+ private Class<? extends Annotation> annotationClass;
+
+ /**
+  * Creates an extractor with a Chi-squared threshold of 0.0.
+  *
+  * @param name
+  *          name handed to the superclass
+  * @param annotationClass
+  *          annotation type handed to the contexts when extracting
+  * @param featureExtractor
+  *          extractor producing the features that are subject to selection
+  * @param contexts
+  *          contexts used by the bounded extract methods
+  */
+ public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, Context... contexts) {
+   this(name, annotationClass, featureExtractor, 0.0, contexts);
+ }
+
+ /**
+  * Creates an extractor with an explicit Chi-squared threshold.
+  *
+  * @param name
+  *          name handed to the superclass
+  * @param annotationClass
+  *          annotation type handed to the contexts when extracting
+  * @param featureExtractor
+  *          extractor producing the features that are subject to selection
+  * @param thres
+  *          features whose Chi-squared value is not above this threshold are dropped by
+  *          train()
+  * @param contexts
+  *          contexts used by the bounded extract methods
+  */
+ public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, double thres, Context... contexts) {
+   super(name);
+   this.contexts = contexts;
+   this.annotationClass = annotationClass;
+   this.init(featureExtractor, thres);
+ }
+
+ /** Stores the wrapped extractor and the Chi-squared selection threshold. */
+ private void init(CombinedExtractor featureExtractor, double thres) {
+   this.chi2Threshold = thres;
+   this.subExtractor = featureExtractor;
+ }
+
+ /**
+  * Extracts features for the focus annotation with the wrapped extractor.
+  *
+  * @return before training, a single TransformableFeature wrapping all extracted features;
+  *         after training, only the extracted features accepted by {@code apply}
+  */
+ @Override
+ public List<Feature> extract(JCas view, Annotation focusAnnotation)
+     throws CleartkExtractorException {
+   List<Feature> candidates = this.subExtractor.extract(view, focusAnnotation);
+   if (!this.isTrained) {
+     // Not trained yet: keep everything in one container feature so that it can be
+     // revisited once the selection is known.
+     List<Feature> wrapped = new ArrayList<Feature>();
+     wrapped.add(new TransformableFeature(this.name, candidates));
+     return wrapped;
+   }
+   // Trained: keep only the selected features.
+   return new ArrayList<Feature>(Collections2.filter(candidates, this));
+ }
+
+ /**
+  * Extracts features from every configured context of the focus annotation, restricted to
+  * the given bounds.
+  *
+  * @return before training, a single TransformableFeature wrapping all extracted features;
+  *         after training, only the extracted features accepted by {@code apply}
+  */
+ public List<Feature> extract(JCas view, Annotation focusAnnotation, Bounds bounds)
+     throws CleartkExtractorException {
+   List<Feature> candidates = new ArrayList<Feature>();
+   for (int i = 0; i < this.contexts.length; i++) {
+     List<Feature> fromContext = this.contexts[i].extract(
+         view,
+         focusAnnotation,
+         bounds,
+         this.annotationClass,
+         this.subExtractor);
+     candidates.addAll(fromContext);
+   }
+   if (!this.isTrained) {
+     // Not trained yet: keep everything in one container feature so that it can be
+     // revisited once the selection is known.
+     List<Feature> wrapped = new ArrayList<Feature>();
+     wrapped.add(new TransformableFeature(this.name, candidates));
+     return wrapped;
+   }
+   // Trained: keep only the selected features.
+   return new ArrayList<Feature>(Collections2.filter(candidates, this));
+ }
+
+ /**
+ * Extract features from the annotations around the focus annotation and within the given bounds.
+ *
+ * @param view
+ * The JCas containing the focus annotation.
+ * @param focusAnnotation
+ * The annotation whose context is to be searched.
+ * @param boundsAnnotation
+ * The boundary within which context annotations may be identified.
+ * @return The features extracted in the context of the focus annotation.
+ */
+ public List<Feature> extractWithin(
+ JCas view,
+ Annotation focusAnnotation,
+ Annotation boundsAnnotation) throws CleartkExtractorException {
+ Bounds bounds = new SpanBounds(boundsAnnotation.getBegin(), boundsAnnotation.getEnd());
+ return this.extract(view, focusAnnotation, bounds);
+ }
+
+ @Override
+ public boolean apply(Feature feature) {
+ return this.selectedFeatures.contains(this.nameFeature(feature));
+ }
+
+ /**
+  * Builds the key under which a feature is counted and selected: the bare feature name for
+  * numeric values, otherwise the name and the value joined by ":".
+  */
+ public String nameFeature(Feature feature) {
+   if (feature.getValue() instanceof Number) {
+     return feature.getName();
+   }
+   return feature.getName() + ":" + feature.getValue();
+ }
+
+ /**
+  * Collects feature/outcome statistics from the instances, drops every feature whose
+  * Chi-squared value is not above the threshold and stores the remaining feature names
+  * ordered by descending Chi-squared value.
+  *
+  * @param instances
+  *          training instances; only features recognised by isTransformable are counted
+  */
+ @Override
+ public void train(Iterable<Instance<OUTCOME_T>> instances) {
+   // Step 1: aggregate statistics for all features.
+   this.chi2Evaluator = new Chi2Evaluator<OUTCOME_T>();
+   for (Instance<OUTCOME_T> instance : instances) {
+     OUTCOME_T outcome = instance.getOutcome();
+     for (Feature feature : instance.getFeatures()) {
+       if (!this.isTransformable(feature)) {
+         continue;
+       }
+       TransformableFeature container = (TransformableFeature) feature;
+       for (Feature inner : container.getFeatures()) {
+         this.chi2Evaluator.update(this.nameFeature(inner), outcome, 1);
+       }
+     }
+   }
+
+   // Step 2: remove features with a small Chi-squared value. The names come straight from
+   // the evaluator's table and are removed through the iterator, so the corresponding rows
+   // disappear from the table as well.
+   ComputeFeatureScore<OUTCOME_T> scorer = this.chi2Evaluator.getScoreFunction();
+   Set<String> candidateNames = this.chi2Evaluator.featValueClassCount.rowKeySet();
+   for (Iterator<String> it = candidateNames.iterator(); it.hasNext();) {
+     String candidate = it.next();
+     if (scorer.apply(candidate) <= this.chi2Threshold) {
+       it.remove();
+     }
+   }
+
+   // Step 3: keep the survivors, highest score first.
+   this.selectedFeatures = Ordering.natural()
+       .onResultOf(scorer)
+       .reverse()
+       .immutableSortedCopy(candidateNames);
+
+   this.isTrained = true;
+ }
+
+ /**
+  * Writes one line per selected feature, "name TAB chi2", to the given file.
+  *
+  * @param uri
+  *          location of the file to write
+  * @throws IOException
+  *           if the extractor has not been trained, if no statistics are available because
+  *           the selection was read with load() instead of computed by train(), or if the
+  *           file cannot be written
+  */
+ @Override
+ public void save(URI uri) throws IOException {
+   if (!this.isTrained) {
+     throw new IOException("Chi2FSExtractor: Cannot save before training.");
+   }
+   if (this.chi2Evaluator == null) {
+     // load() marks the extractor as trained without creating an evaluator; fail with a
+     // clear message instead of a NullPointerException.
+     throw new IOException("Chi2FSExtractor: Cannot save, no Chi2 statistics available.");
+   }
+   File out = new File(uri);
+   BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+   try {
+     ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
+     for (String feature : this.selectedFeatures) {
+       writer.append(String.format("%s\t%f\n", feature, computeScore.apply(feature)));
+     }
+   } finally {
+     // Release the file handle even when a write fails.
+     writer.close();
+   }
+ }
+
+ /**
+  * Reads the selected feature names from a file written by save() and marks the extractor
+  * as trained. Only the text before the first tab of each line is used.
+  *
+  * @param uri
+  *          location of the file to read
+  * @throws IOException
+  *           if the file cannot be read; the previous selection is left untouched then
+  */
+ @Override
+ public void load(URI uri) throws IOException {
+   List<String> loaded = Lists.newArrayList();
+   File in = new File(uri);
+   BufferedReader reader = new BufferedReader(new FileReader(in));
+   try {
+     // Every line is a feature name followed by its selection score.
+     String line = null;
+     while ((line = reader.readLine()) != null) {
+       String[] featureValuePair = line.split("\\t");
+       loaded.add(featureValuePair[0]);
+     }
+   } finally {
+     // Release the file handle even when reading fails.
+     reader.close();
+   }
+
+   // Publish the result only after the whole file was read successfully.
+   this.selectedFeatures = loaded;
+   this.isTrained = true;
+ }
+
+ /**
+  * Extracts context features for the span that starts at the end of the first annotation
+  * and stops at the begin of the second one, without restricting the context search.
+  */
+ @Override
+ public List<Feature> extractBetween(JCas jCas, Annotation annotation1,
+     Annotation annotation2) throws CleartkExtractorException {
+   // FIXME: creating a new annotation may leak memory - is there a better approach?
+   Annotation gap = new Annotation(jCas, annotation1.getEnd(), annotation2.getBegin());
+   Bounds unrestricted = new NoBounds();
+   return this.extract(jCas, gap, unrestricted);
+ }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
------------------------------------------------------------------------------
svn:mime-type = text/plain