You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by st...@apache.org on 2012/12/20 00:14:56 UTC
svn commit: r1424215 - in
/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal:
ae/ ae/feature/ eval/
Author: stevenbethard
Date: Wed Dec 19 23:14:55 2012
New Revision: 1424215
URL: http://svn.apache.org/viewvc?rev=1424215&view=rev
Log:
Fixes and refactors a bunch of really awful code that was recently introduced.
Modified:
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Wed Dec 19 23:14:55 2012
@@ -1,4 +1,5 @@
package org.apache.ctakes.temporal.ae;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
@@ -18,20 +19,15 @@ package org.apache.ctakes.temporal.ae;
* under the License.
*/
-
import java.io.File;
import java.io.IOException;
import java.net.URI;
-//import java.net.URL;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
-//import java.util.logging.Logger;
-//import org.apache.ctakes.temporal.ae.feature.CoveredTextToValuesExtractor;
import org.apache.ctakes.temporal.ae.feature.PhraseExtractor;
import org.apache.ctakes.temporal.ae.feature.SRLExtractor;
import org.apache.ctakes.temporal.ae.feature.SurfaceFormFeatureExtractor;
@@ -47,10 +43,8 @@ import org.apache.uima.analysis_engine.A
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.classifier.CleartkAnnotator;
-//import org.cleartk.classifier.DataWriter;
import org.cleartk.classifier.Feature;
import org.cleartk.classifier.Instance;
-//import org.cleartk.classifier.feature.transform.InstanceDataWriter;
import org.cleartk.classifier.chunking.BIOChunking;
import org.cleartk.classifier.feature.extractor.CleartkExtractor;
import org.cleartk.classifier.feature.extractor.CleartkExtractor.Following;
@@ -64,67 +58,74 @@ import org.cleartk.classifier.feature.ex
import org.cleartk.classifier.jar.DefaultDataWriterFactory;
import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
import org.cleartk.classifier.jar.GenericJarClassifierFactory;
+import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.ConfigurationParameterFactory;
-import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.util.JCasUtil;
-//import com.google.common.base.Charsets;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
-//import com.google.common.io.LineProcessor;
-//import com.google.common.io.Resources;
public class EventAnnotator extends CleartkAnnotator<String> {
public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE = "ProbabilityOfKeepingANegativeExample";
@ConfigurationParameter(
- name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
- mandatory = false,
- description = "probability that a negative example should be retained for training")
- protected Float probabilityOfKeepingANegativeExample = 0.8f;
-
- public static final String PARAM_FEATURE_TRIM_ORNOT = "WhetherToDoFeatureSelection";
+ name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+ mandatory = false,
+ description = "probability that a negative example should be retained for training")
+ protected Float probabilityOfKeepingANegativeExample = 1f;
+
+ public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection";
@ConfigurationParameter(
- name = PARAM_FEATURE_TRIM_ORNOT,
- mandatory = false,
- description = "set whether feature selection is used or not")
- public static Float featureTrim = 0f;
-
+ name = PARAM_FEATURE_SELECTION_THRESHOLD,
+ mandatory = false,
+ description = "the Chi-squared threshold at which features should be removed")
+ protected Float featureSelectionThreshold = 0f;
+
+ public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
+
+ @ConfigurationParameter(
+ mandatory = false,
+ name = PARAM_FEATURE_SELECTION_URI,
+ description = "provides a URI where the feature selection data will be written")
+ protected URI featureSelectionURI;
+
public static AnalysisEngineDescription createDataWriterDescription(
- String dataWriterName,
- File outputDirectory, float downratio, float featureSelect) throws ResourceInitializationException {
+ Class<?> dataWriter,
+ File outputDirectory,
+ float downratio,
+ float featureSelect) throws ResourceInitializationException {
return AnalysisEngineFactory.createPrimitiveDescription(
EventAnnotator.class,
CleartkAnnotator.PARAM_IS_TRAINING,
true,
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
- dataWriterName,
+ dataWriter,
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
outputDirectory,
EventAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
downratio,
- EventAnnotator.PARAM_FEATURE_TRIM_ORNOT,
+ EventAnnotator.PARAM_FEATURE_SELECTION_THRESHOLD,
featureSelect);
}
public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
throws ResourceInitializationException {
- AnalysisEngineDescription fsEventAnnotator =AnalysisEngineFactory.createPrimitiveDescription(
+ AnalysisEngineDescription fsEventAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
EventAnnotator.class,
CleartkAnnotator.PARAM_IS_TRAINING,
false,
GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
new File(modelDirectory, "model.jar"));
- ConfigurationParameterFactory.addConfigurationParameter(
- fsEventAnnotator,
- EventAnnotator.PARAM_NB_FS_URI,
- EventAnnotator.createNbFSURI(modelDirectory) );
-
- return(fsEventAnnotator);
+ ConfigurationParameterFactory.addConfigurationParameter(
+ fsEventAnnotator,
+ EventAnnotator.PARAM_FEATURE_SELECTION_URI,
+ EventAnnotator.createFeatureSelectionURI(modelDirectory));
+
+ return (fsEventAnnotator);
}
protected List<SimpleFeatureExtractor> tokenFeatureExtractors;
@@ -134,26 +135,16 @@ public class EventAnnotator extends Clea
private BIOChunking<BaseToken, EntityMention> entityChunking;
private BIOChunking<BaseToken, EventMention> eventChunking;
-
- public static final String PARAM_NB_FS_URI = ConfigurationParameterFactory.createConfigurationParameterName(
- EventAnnotator.class,
- "neighborFsUri");
-
- @ConfigurationParameter(
- mandatory = false,
- description = "provides a URI where the neighbor annotation's feature selection data will be written")
- protected URI neighborFsUri;
-
- public static final String FS_NEIGHBOR_EXTRACTOR_KEY = "SelectNeighborFeatures";
-
- private Chi2NeighborFSExtractor<String> chi2NeighborFsExtractor;
-
-
- public static URI createNbFSURI(File outputDirectoryName) {
- File f = new File(outputDirectoryName, FS_NEIGHBOR_EXTRACTOR_KEY + "_Chi2_extractor.dat");
- return f.toURI();
- }
- //*****feature selection related parameters
+
+ public static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
+
+ private Chi2NeighborFSExtractor<String> featureSelectionExtractor;
+
+ public static URI createFeatureSelectionURI(File outputDirectoryName) {
+ return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
+ }
+
+ // *****feature selection related parameters
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
@@ -167,74 +158,47 @@ public class EventAnnotator extends Clea
this.eventChunking = new BIOChunking<BaseToken, EventMention>(
BaseToken.class,
EventMention.class);
-
- //configure FS extractor:
- if (featureTrim > 0){//if feature selection
- CombinedExtractor forneighbors = new CombinedExtractor(
- new CoveredTextExtractor(),
- new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
- new TypePathExtractor(BaseToken.class, "partOfSpeech"),
- new SurfaceFormFeatureExtractor(),
- new PhraseExtractor(),
- new SRLExtractor());
-
- try {
- this.chi2NeighborFsExtractor = initNbFSExtractor(forneighbors);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }else{//if no feature selection
- // add features: word, stem, pos and more
- this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
- // try {
- this.tokenFeatureExtractors.addAll(Arrays.asList(
- new CoveredTextExtractor(),
-// new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
-// new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
-// new SurfaceFormFeatureExtractor(),
-// new PhraseExtractor(),
- new SRLExtractor()));
- // new CoveredTextToValuesExtractor("ACF", StringToDoublesProcessor.parse("/word_freq.lst")),
- // new CoveredTextToValuesExtractor("PCA", StringToDoublesProcessor.parse("/word_pca.lst")),
- // new CoveredTextToValuesExtractor("TimPCA", StringToDoublesProcessor.parse("/tim_word_pca.txt"))));
-
- //add window of features before and after
- CombinedExtractor subExtractor = new CombinedExtractor(
- new CoveredTextExtractor(),
- new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
- new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
- new SurfaceFormFeatureExtractor(),
- new SRLExtractor());
-
- this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
- this.contextFeatureExtractors.add(new CleartkExtractor(
- BaseToken.class,
- subExtractor,
- new Preceding(3),
- new Following(3)));
- }
+ CombinedExtractor subExtractor = new CombinedExtractor(
+ new CoveredTextExtractor(),
+ new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+ new TypePathExtractor(BaseToken.class, "partOfSpeech"),
+ new SurfaceFormFeatureExtractor(),
+ new PhraseExtractor(),
+ new SRLExtractor());
+
+ if (featureSelectionThreshold > 0) {
+ this.featureSelectionExtractor = new Chi2NeighborFSExtractor<String>(
+ EventAnnotator.FEATURE_SELECTION_NAME,
+ BaseToken.class,
+ subExtractor,
+ this.featureSelectionThreshold,
+ new Preceding(4),
+ new Following(4));
+ if (this.featureSelectionURI != null) {
+ try {
+ this.featureSelectionExtractor.load(this.featureSelectionURI);
+ } catch (IOException e) {
+ throw new ResourceInitializationException(e);
+ }
+ }
+ } else {
+ this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
+ this.tokenFeatureExtractors.add(subExtractor);
+ this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
+ this.contextFeatureExtractors.add(new CleartkExtractor(
+ BaseToken.class,
+ subExtractor,
+ new Preceding(3),
+ new Following(3)));
+ }
}
-
-private Chi2NeighborFSExtractor<String> initNbFSExtractor(
- CombinedExtractor subextractor) throws IOException{
-
- Chi2NeighborFSExtractor<String> chi2NbFSExtractor = new Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, BaseToken.class, subextractor, featureTrim, new Preceding(4),
- new Following(4)); //the 3rd last parameter is used to control chi2 threshold, the last two are used to control window size
-
- if (this.neighborFsUri != null) {
- chi2NbFSExtractor.load(this.neighborFsUri);
- }
- return chi2NbFSExtractor;
-}
-
-
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
-
- Random rand = new Random();
+
+ Random rand = new Random();
// classify tokens within each sentence
for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
@@ -270,54 +234,62 @@ private Chi2NeighborFSExtractor<String>
int tokenIndex = -1;
int window = 2;
int nPreviousClassifications = 2;
-
+
for (BaseToken token : tokens) {
++tokenIndex;
List<Feature> features = new ArrayList<Feature>();
-
- if (featureTrim >0 ){//if feature selection
- features.addAll(this.chi2NeighborFsExtractor.extract(jCas, token)); //base features
- features.addAll(this.chi2NeighborFsExtractor.extractWithin(jCas, token, sentence)); //neighbor features
- features.addAll(this.chi2NeighborFsExtractor.extract(entityTypeIDs, entityTagsByType,tokenIndex, window)); // features from surrounding entities
- features.addAll(this.chi2NeighborFsExtractor.extract(nPreviousClassifications, tokenIndex, outcomes)); //features from previous classifications
- }else{ //if no feature selection
- // features from token attributes
- for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
- features.addAll(extractor.extract(jCas, token));
- }
- // features from surrounding tokens
- for (CleartkExtractor extractor : this.contextFeatureExtractors) {
- features.addAll(extractor.extractWithin(jCas, token, sentence));
- }
- // features from surrounding entities
- for (int typeID : entityTypeIDs) {
- List<String> tokenEntityTags = entityTagsByType.get(typeID);
- int begin = Math.max(tokenIndex - window, 0);
- int end = Math.min(tokenIndex + window, tokenEntityTags.size());
- for (int i = begin; i < end; ++i) {
- String name = String.format("EntityTag_%d_%d", typeID, i - begin);
- features.add(new Feature(name, tokenEntityTags.get(i)));
- }
- }
- // features from previous classifications
- for (int i = nPreviousClassifications; i > 0; --i) {
- int index = tokenIndex - i;
- String previousOutcome = index < 0 ? "O" : outcomes.get(index);
- features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+
+ if (featureSelectionThreshold > 0) {// if feature selection
+ features.addAll(this.featureSelectionExtractor.extract(jCas, token)); // base features
+ features.addAll(this.featureSelectionExtractor.extractWithin(jCas, token, sentence)); // neighbor
+ // features
+ features.addAll(this.featureSelectionExtractor.extract(
+ entityTypeIDs,
+ entityTagsByType,
+ tokenIndex,
+ window)); // features from surrounding entities
+ features.addAll(this.featureSelectionExtractor.extract(
+ nPreviousClassifications,
+ tokenIndex,
+ outcomes)); // features from previous classifications
+ } else { // if no feature selection
+ // features from token attributes
+ for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
+ features.addAll(extractor.extract(jCas, token));
+ }
+ // features from surrounding tokens
+ for (CleartkExtractor extractor : this.contextFeatureExtractors) {
+ features.addAll(extractor.extractWithin(jCas, token, sentence));
+ }
+ // features from surrounding entities
+ for (int typeID : entityTypeIDs) {
+ List<String> tokenEntityTags = entityTagsByType.get(typeID);
+ int begin = Math.max(tokenIndex - window, 0);
+ int end = Math.min(tokenIndex + window, tokenEntityTags.size());
+ for (int i = begin; i < end; ++i) {
+ String name = String.format("EntityTag_%d_%d", typeID, i - begin);
+ features.add(new Feature(name, tokenEntityTags.get(i)));
}
+ }
+ // features from previous classifications
+ for (int i = nPreviousClassifications; i > 0; --i) {
+ int index = tokenIndex - i;
+ String previousOutcome = index < 0 ? "O" : outcomes.get(index);
+ features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+ }
}
-
+
// if training, write to data file
if (this.isTraining()) {
- String outcome = outcomes.get(tokenIndex);
- if(outcome.equals("O")){ //if it is an "O". downsample it
- if (rand.nextDouble()<=probabilityOfKeepingANegativeExample)
- this.dataWriter.write(new Instance<String>(outcome, features));
- }else {
- this.dataWriter.write(new Instance<String>(outcome, features));
- }
+ String outcome = outcomes.get(tokenIndex);
+ if (outcome.equals("O")) { // if it is an "O". downsample it
+ if (rand.nextDouble() <= probabilityOfKeepingANegativeExample)
+ this.dataWriter.write(new Instance<String>(outcome, features));
+ } else {
+ this.dataWriter.write(new Instance<String>(outcome, features));
}
+ }
// if predicting, add prediction to outcomes
else {
@@ -334,47 +306,14 @@ private Chi2NeighborFSExtractor<String>
private static Predicate<EntityMention> hasEntityType(final int typeID) {
return new Predicate<EntityMention>() {
+ @Override
public boolean apply(EntityMention mention) {
return mention.getTypeID() == typeID;
}
};
}
-// private static class StringToDoublesProcessor implements LineProcessor<Map<String, double[]>> {
-// private Logger logger = Logger.getLogger(this.getClass().getName());
-//
-// private Map<String, double[]> result = new HashMap<String, double[]>();
-//
-// private int length = -1;
-//
-// @Override
-// public Map<String, double[]> getResult() {
-// return this.result;
-// }
-//
-// @Override
-// public boolean processLine(String line) throws IOException {
-// String[] parts = line.trim().split(",");
-// String key = parts[0];
-// int partsOffset = 0;
-// if (this.length == -1) {
-// this.length = parts.length;
-// } else if (parts.length != this.length) {
-// String message = "expected %d parts, found %d, skipping line '%s'";
-// this.logger.warning(String.format(message, this.length, parts.length, line));
-// return true;
-// }
-// double[] values = new double[parts.length - 1];
-// for (int i = 0; i < values.length; ++i) {
-// values[i] = Double.parseDouble(parts[i + 1 + partsOffset]);
-// }
-// this.result.put(key, values);
-// return true;
-// }
-// }
-
-
-public Chi2NeighborFSExtractor<String> getChi2NbSubExtractor() {
- return this.chi2NeighborFsExtractor;
-}
+ public Chi2NeighborFSExtractor<String> getChi2NbSubExtractor() {
+ return this.featureSelectionExtractor;
+ }
}
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java Wed Dec 19 23:14:55 2012
@@ -18,9 +18,14 @@
*/
package org.apache.ctakes.temporal.ae.feature;
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.logging.Logger;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
@@ -28,6 +33,9 @@ import org.cleartk.classifier.Feature;
import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import com.google.common.io.Files;
+import com.google.common.io.LineProcessor;
+
public class CoveredTextToValuesExtractor implements SimpleFeatureExtractor {
private String name;
@@ -35,6 +43,43 @@ public class CoveredTextToValuesExtracto
private Map<String, double[]> textDoublesMap;
private double[] meanValues;
+
+ public static Map<String, double[]> parseTextDoublesMap(File file, Charset charset) throws IOException {
+ return Files.readLines(file, charset, new StringToDoublesProcessor());
+ }
+
+ static class StringToDoublesProcessor implements LineProcessor<Map<String, double[]>> {
+ private Logger logger = Logger.getLogger(this.getClass().getName());
+
+ private Map<String, double[]> result = new HashMap<String, double[]>();
+
+ private int length = -1;
+
+ @Override
+ public Map<String, double[]> getResult() {
+ return this.result;
+ }
+
+ @Override
+ public boolean processLine(String line) throws IOException {
+ String[] parts = line.trim().split(",");
+ String key = parts[0];
+ int partsOffset = 0;
+ if (this.length == -1) {
+ this.length = parts.length;
+ } else if (parts.length != this.length) {
+ String message = "expected %d parts, found %d, skipping line '%s'";
+ this.logger.warning(String.format(message, this.length, parts.length, line));
+ return true;
+ }
+ double[] values = new double[parts.length - 1];
+ for (int i = 0; i < values.length; ++i) {
+ values[i] = Double.parseDouble(parts[i + 1 + partsOffset]);
+ }
+ this.result.put(key, values);
+ return true;
+ }
+ }
public CoveredTextToValuesExtractor(String name, Map<String, double[]> textDoublesMap) {
super();
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java Wed Dec 19 23:14:55 2012
@@ -20,7 +20,6 @@ package org.apache.ctakes.temporal.eval;
import java.io.File;
import java.io.IOException;
-import java.net.URI;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
@@ -32,17 +31,12 @@ import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
-import org.apache.ctakes.temporal.ae.EventAnnotator;
-import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
-import org.cleartk.classifier.Instance;
-import org.cleartk.classifier.feature.transform.InstanceStream;
-import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
import org.cleartk.eval.AnnotationStatistics;
import org.cleartk.util.ViewURIUtil;
import org.uimafit.factory.AggregateBuilder;
@@ -92,25 +86,6 @@ public abstract class EvaluationOfAnnota
aggregateBuilder.add(this.getPreprocessorTrainDescription());
aggregateBuilder.add(this.getDataWriterDescription(directory));
SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
-
- if( EventAnnotator.featureTrim > 0 ){
- //Extracting features and writing instances
- Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
- // Collect MinMax stats for feature normalization
- URI chi2NbFsURI = EventAnnotator.createNbFSURI(directory);
- Chi2NeighborFSExtractor<String> chi2NbFsExtractor = new Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, EventAnnotator.featureTrim);
- chi2NbFsExtractor.train(instances);
- chi2NbFsExtractor.save(chi2NbFsURI);
- //now write in the libsvm format
- this.logger.info("Write out model training data");
- LIBSVMStringOutcomeDataWriter dataWriter = new LIBSVMStringOutcomeDataWriter(directory);
- for (Instance<String> instance : instances) {
- instance = chi2NbFsExtractor.transform(instance);
- dataWriter.write(instance);
- }
- dataWriter.finish();
- }
-
this.trainAndPackage(directory);
}
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Wed Dec 19 23:14:55 2012
@@ -19,29 +19,46 @@
package org.apache.ctakes.temporal.eval;
import java.io.File;
+import java.net.URI;
import java.util.Collection;
import java.util.EnumSet;
import java.util.List;
import java.util.logging.Level;
import org.apache.ctakes.temporal.ae.EventAnnotator;
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.Instance;
import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.feature.transform.InstanceStream;
import org.cleartk.classifier.jar.JarClassifierBuilder;
import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
import org.cleartk.eval.AnnotationStatistics;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.pipeline.SimplePipeline;
import org.uimafit.util.JCasUtil;
import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
public class EvaluationOfEventSpans extends EvaluationOfAnnotationSpans_ImplBase {
+ static interface Options extends Evaluation_ImplBase.Options {
+
+ @Option(longName = "downratio", defaultValue = "1")
+ public float getProbabilityOfKeepingANegativeExample();
+
+ @Option(longName = "featureSelectionThreshold", defaultValue = "0")
+ public float getFeatureSelectionThreshold();
+ }
+
public static void main(String[] args) throws Exception {
Options options = CliFactory.parseArguments(Options.class, args);
EvaluationOfEventSpans evaluation = new EvaluationOfEventSpans(
@@ -49,57 +66,78 @@ public class EvaluationOfEventSpans exte
options.getRawTextDirectory(),
options.getKnowtatorXMLDirectory(),
options.getPatients().getList(),
- options.getDownSampleRatio(),
- options.getFeatureSelect()); //control apply feature selection or not
+ options.getProbabilityOfKeepingANegativeExample(),
+ options.getFeatureSelectionThreshold());
evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-event-errors.log"));
- List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(4);
+ List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(2);
for (AnnotationStatistics<String> stats : foldStats) {
System.err.println(stats);
}
System.err.println("OVERALL");
System.err.println(AnnotationStatistics.addAll(foldStats));
}
-
- private float downratio;
- private float featureTrim;
+
+ private float probabilityOfKeepingANegativeExample;
+
+ private float featureSelectionThreshold;
public EvaluationOfEventSpans(
File baseDirectory,
File rawTextDirectory,
File knowtatorXMLDirectory,
List<Integer> patientSets,
- float downratio, float featureSelect) {
- super(
- baseDirectory,
- rawTextDirectory,
- knowtatorXMLDirectory,
- patientSets,
- EnumSet.of(AnnotatorType.PART_OF_SPEECH_TAGS,
+ float probabilityOfKeepingANegativeExample,
+ float featureSelectionThreshold) {
+ super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, patientSets, EnumSet.of(
+ AnnotatorType.PART_OF_SPEECH_TAGS));
//AnnotatorType.UMLS_NAMED_ENTITIES,
-// AnnotatorType.LEXICAL_VARIANTS,
- AnnotatorType.DEPENDENCIES,
- AnnotatorType.SEMANTIC_ROLES));
- this.downratio = downratio;
- this.featureTrim = featureSelect;
+ //AnnotatorType.LEXICAL_VARIANTS,
+ //AnnotatorType.DEPENDENCIES,
+ //AnnotatorType.SEMANTIC_ROLES));
+ this.probabilityOfKeepingANegativeExample = probabilityOfKeepingANegativeExample;
+ this.featureSelectionThreshold = featureSelectionThreshold;
}
@Override
protected AnalysisEngineDescription getDataWriterDescription(File directory)
throws ResourceInitializationException {
- if(this.featureTrim > 0){
- return EventAnnotator.createDataWriterDescription(
- InstanceDataWriter.class.getName(),
- directory,
- this.downratio,
- this.featureTrim);
- }
- return EventAnnotator.createDataWriterDescription(
- LIBSVMStringOutcomeDataWriter.class.getName(),
- directory,
- this.downratio,
- this.featureTrim);
-
-
+ Class<?> dataWriterClass = this.featureSelectionThreshold > 0f
+ ? InstanceDataWriter.class
+ : LIBSVMStringOutcomeDataWriter.class;
+ return EventAnnotator.createDataWriterDescription(
+ dataWriterClass,
+ directory,
+ this.probabilityOfKeepingANegativeExample,
+ this.featureSelectionThreshold);
+ }
+
+ @Override
+ protected void train(CollectionReader collectionReader, File directory) throws Exception {
+ AggregateBuilder aggregateBuilder = new AggregateBuilder();
+ aggregateBuilder.add(this.getPreprocessorTrainDescription());
+ aggregateBuilder.add(this.getDataWriterDescription(directory));
+ SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
+
+ if (this.featureSelectionThreshold > 0) {
+ // Load the instances that the data writer stored in the directory
+ Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
+ // Train the Chi-squared feature selection extractor and save its data
+ URI chi2NbFsURI = EventAnnotator.createFeatureSelectionURI(directory);
+ Chi2NeighborFSExtractor<String> chi2NbFsExtractor = new Chi2NeighborFSExtractor<String>(
+ EventAnnotator.FEATURE_SELECTION_NAME,
+ this.featureSelectionThreshold);
+ chi2NbFsExtractor.train(instances);
+ chi2NbFsExtractor.save(chi2NbFsURI);
+ // now write in the libsvm format
+ LIBSVMStringOutcomeDataWriter dataWriter = new LIBSVMStringOutcomeDataWriter(directory);
+ for (Instance<String> instance : instances) {
+ instance = chi2NbFsExtractor.transform(instance);
+ dataWriter.write(instance);
+ }
+ dataWriter.finish();
+ }
+
+ this.trainAndPackage(directory);
}
@Override
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1424215&r1=1424214&r2=1424215&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Wed Dec 19 23:14:55 2012
@@ -84,12 +84,6 @@ public abstract class Evaluation_ImplBas
@Option(longName = "patients")
public CommandLine.IntegerRanges getPatients();
-
- @Option(longName = "downratio", defaultValue="1")
- public float getDownSampleRatio();
-
- @Option(longName = "featureSelect", defaultValue="0")
- public float getFeatureSelect(); //get feature selection cut off threshold is it is > 0. apply no FS if featureSelect == 0
}
protected File rawTextDirectory;