You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2013/07/25 17:57:15 UTC
svn commit: r1507033 - in
/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal:
ae/EventAnnotator.java ae/TimeAnnotator.java eval/EvaluationOfTimeSpans.java
Author: clin
Date: Thu Jul 25 15:57:15 2013
New Revision: 1507033
URL: http://svn.apache.org/r1507033
Log:
add SMOTE for time annotator
Modified:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1507033&r1=1507032&r2=1507033&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Thu Jul 25 15:57:15 2013
@@ -88,12 +88,12 @@ public class EventAnnotator extends Temp
protected Float featureSelectionThreshold = 1f; //default is not using feature selection, i.e. select 100% of all features.
public static final String PARAM_SMOTE_NUM_NEIGHBORS = "NumOfNeighborForSMOTE";
-
+
@ConfigurationParameter(
- name = PARAM_SMOTE_NUM_NEIGHBORS,
- mandatory = false,
- description = "the number of neighbors used for minority instances for SMOTE algorithm")
- protected Float smoteNumOfNeighbors = 0f;
+ name = PARAM_SMOTE_NUM_NEIGHBORS,
+ mandatory = false,
+ description = "the number of neighbors used for minority instances for SMOTE algorithm")
+ protected Float smoteNumOfNeighbors = 0f;
public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java?rev=1507033&r1=1507032&r2=1507033&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java Thu Jul 25 15:57:15 2013
@@ -28,6 +28,7 @@ import org.apache.ctakes.temporal.ae.fea
import org.apache.ctakes.temporal.ae.feature.TimeWordTypeExtractor;
import org.apache.ctakes.temporal.ae.feature.selection.Chi2FeatureSelection;
import org.apache.ctakes.temporal.ae.feature.selection.FeatureSelection;
+import org.apache.ctakes.temporal.utils.SMOTEplus;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.textsem.TimeMention;
import org.apache.ctakes.typesystem.type.textspan.Segment;
@@ -67,7 +68,7 @@ public class TimeAnnotator extends Tempo
name = PARAM_FEATURE_SELECTION_THRESHOLD,
mandatory = false,
description = "the Chi-squared threshold at which features should be removed")
- protected Float featureSelectionThreshold = 0f;
+ protected Float featureSelectionThreshold = 1f;
public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
@@ -76,13 +77,22 @@ public class TimeAnnotator extends Tempo
name = PARAM_FEATURE_SELECTION_URI,
description = "provides a URI where the feature selection data will be written")
protected URI featureSelectionURI;
+
+ public static final String PARAM_SMOTE_NUM_NEIGHBORS = "NumOfNeighborForSMOTE";
+
+ @ConfigurationParameter(
+ name = PARAM_SMOTE_NUM_NEIGHBORS,
+ mandatory = false,
+ description = "the number of neighbors used for minority instances for SMOTE algorithm")
+ protected Float smoteNumOfNeighbors = 0f;
public static final String TIMEX_VIEW = "TimexView";
public static AnalysisEngineDescription createDataWriterDescription(
Class<?> dataWriterClass,
File outputDirectory,
- float featureSelect) throws ResourceInitializationException {
+ float featureSelect,
+ float smoteNeighborNumber) throws ResourceInitializationException {
return AnalysisEngineFactory.createPrimitiveDescription(
TimeAnnotator.class,
CleartkAnnotator.PARAM_IS_TRAINING,
@@ -92,7 +102,9 @@ public class TimeAnnotator extends Tempo
DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
outputDirectory,
TimeAnnotator.PARAM_FEATURE_SELECTION_THRESHOLD,
- featureSelect);
+ featureSelect,
+ EventAnnotator.PARAM_SMOTE_NUM_NEIGHBORS,
+ smoteNeighborNumber);
}
public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
@@ -121,7 +133,7 @@ public class TimeAnnotator extends Tempo
private static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
public static FeatureSelection<String> createFeatureSelection(double threshold) {
- return new Chi2FeatureSelection<String>(TimeAnnotator.FEATURE_SELECTION_NAME, threshold);
+ return new Chi2FeatureSelection<String>(TimeAnnotator.FEATURE_SELECTION_NAME, threshold, true);
}
public static URI createFeatureSelectionURI(File outputDirectoryName) {
@@ -159,7 +171,7 @@ public class TimeAnnotator extends Tempo
parseExtractor = new ParseSpanFeatureExtractor();
//initialize feature selection
- if (featureSelectionThreshold == 0) {
+ if (featureSelectionThreshold == 1) {
this.featureSelection = null;
} else {
this.featureSelection = TimeAnnotator.createFeatureSelection(this.featureSelectionThreshold);
@@ -176,10 +188,13 @@ public class TimeAnnotator extends Tempo
@Override
public void process(JCas jCas, Segment segment) throws AnalysisEngineProcessException {
+ // SMOTE collector: minority-class instances are added to it during training, and synthetic minority samples are generated from it after all sentences are processed
+ SMOTEplus smote = new SMOTEplus((int)Math.ceil(this.smoteNumOfNeighbors));
+
// classify tokens within each sentence
for (Sentence sentence : JCasUtil.selectCovered(jCas, Sentence.class, segment)) {
List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
-
+
// during training, the list of all outcomes for the tokens
List<String> outcomes;
if (this.isTraining()) {
@@ -234,12 +249,20 @@ public class TimeAnnotator extends Tempo
}
// if training, write to data file
- if (this.isTraining()) {
- String outcome = outcomes.get(tokenIndex);
- this.dataWriter.write(new Instance<String>(outcome, features));
- }else {// if predicting, add prediction to outcomes
- outcomes.add(this.classifier.classify(features));
- }
+ if (this.isTraining()) {
+ String outcome = outcomes.get(tokenIndex);
+ // "O" outcomes are written unchanged (no down-sampling is performed here) and are not passed to SMOTE
+ if (outcome.equals("O")) {
+ this.dataWriter.write(new Instance<String>(outcome, features));
+
+ }else{//for minority instances:
+ Instance<String> minorityInst = new Instance<String>(outcome, features);
+ this.dataWriter.write(minorityInst);
+ smote.addInstance(minorityInst);//add minority instances to SMOTE algorithm
+ }
+ }else {// if predicting, add prediction to outcomes
+ outcomes.add(this.classifier.classify(features));
+ }
}
// during prediction, convert chunk labels to times and add them to the CAS
@@ -253,5 +276,11 @@ public class TimeAnnotator extends Tempo
this.timeChunking.createChunks(timexCas, tokens, outcomes);
}
}
+ if(this.isTraining() && this.smoteNumOfNeighbors >= 1){ //add synthetic instances to datawriter, if smote is selected
+ Iterable<Instance<String>> syntheticInsts = smote.populateMinorityClass();
+ for( Instance<String> sytheticInst: syntheticInsts){
+ this.dataWriter.write(sytheticInst);
+ }
+ }
}
}
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java?rev=1507033&r1=1507032&r2=1507033&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java Thu Jul 25 15:57:15 2013
@@ -64,8 +64,11 @@ public class EvaluationOfTimeSpans exten
static interface Options extends Evaluation_ImplBase.Options {
- @Option(longName = "featureSelectionThreshold", defaultValue = "0")
+ @Option(longName = "featureSelectionThreshold", defaultValue = "1")
public float getFeatureSelectionThreshold();
+
+ @Option(longName = "SMOTENeighborNumber", defaultValue = "0")
+ public float getSMOTENeighborNumber();
}
public static void main(String[] args) throws Exception {
@@ -99,6 +102,7 @@ public class EvaluationOfTimeSpans exten
options.getXMIDirectory(),
options.getTreebankDirectory(),
options.getFeatureSelectionThreshold(),
+ options.getSMOTENeighborNumber(),
annotatorClass,
options.getPrintOverlappingSpans(),
annotatorTrainingArguments.get(annotatorClass));
@@ -131,6 +135,8 @@ public class EvaluationOfTimeSpans exten
private String[] trainingArguments;
private float featureSelectionThreshold;
+
+ private float smoteNeighborNumber;
public EvaluationOfTimeSpans(
File baseDirectory,
@@ -140,6 +146,7 @@ public class EvaluationOfTimeSpans exten
File xmiDirectory,
File treebankDirectory,
float featureSelectionThreshold,
+ float numOfSmoteNeighbors,
Class<? extends JCasAnnotator_ImplBase> annotatorClass,
boolean printOverlapping,
String[] trainingArguments) {
@@ -148,6 +155,7 @@ public class EvaluationOfTimeSpans exten
this.featureSelectionThreshold = featureSelectionThreshold;
this.trainingArguments = trainingArguments;
this.printOverlapping = printOverlapping;
+ this.smoteNeighborNumber = numOfSmoteNeighbors;
}
@Override
@@ -164,7 +172,8 @@ public class EvaluationOfTimeSpans exten
return TimeAnnotator.createDataWriterDescription(
dataWriterClass,
this.getModelDirectory(directory),
- this.featureSelectionThreshold);
+ this.featureSelectionThreshold,
+ this.smoteNeighborNumber);
}
return AnalysisEngineFactory.createPrimitiveDescription(
this.annotatorClass,