You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2013/07/25 17:57:15 UTC

svn commit: r1507033 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/EventAnnotator.java ae/TimeAnnotator.java eval/EvaluationOfTimeSpans.java

Author: clin
Date: Thu Jul 25 15:57:15 2013
New Revision: 1507033

URL: http://svn.apache.org/r1507033
Log:
add SMOTE for time annotator

Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1507033&r1=1507032&r2=1507033&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Thu Jul 25 15:57:15 2013
@@ -88,12 +88,12 @@ public class EventAnnotator extends Temp
   protected Float featureSelectionThreshold = 1f; //default is not using feature selection, i.e. select 100% of all features.
   
   public static final String PARAM_SMOTE_NUM_NEIGHBORS = "NumOfNeighborForSMOTE";
-  
+
   @ConfigurationParameter(
-	      name = PARAM_SMOTE_NUM_NEIGHBORS,
-	      mandatory = false,
-	      description = "the number of neighbors used for minority instances for SMOTE algorithm")
-	  protected Float smoteNumOfNeighbors = 0f;
+		  name = PARAM_SMOTE_NUM_NEIGHBORS,
+		  mandatory = false,
+		  description = "the number of neighbors used for minority instances for SMOTE algorithm")
+  protected Float smoteNumOfNeighbors = 0f;
 
   public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
 

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java?rev=1507033&r1=1507032&r2=1507033&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/TimeAnnotator.java Thu Jul 25 15:57:15 2013
@@ -28,6 +28,7 @@ import org.apache.ctakes.temporal.ae.fea
 import org.apache.ctakes.temporal.ae.feature.TimeWordTypeExtractor;
 import org.apache.ctakes.temporal.ae.feature.selection.Chi2FeatureSelection;
 import org.apache.ctakes.temporal.ae.feature.selection.FeatureSelection;
+import org.apache.ctakes.temporal.utils.SMOTEplus;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.textsem.TimeMention;
 import org.apache.ctakes.typesystem.type.textspan.Segment;
@@ -67,7 +68,7 @@ public class TimeAnnotator extends Tempo
 			name = PARAM_FEATURE_SELECTION_THRESHOLD,
 			mandatory = false,
 			description = "the Chi-squared threshold at which features should be removed")
-	protected Float featureSelectionThreshold = 0f;
+	protected Float featureSelectionThreshold = 1f;
 	
 	public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
 
@@ -76,13 +77,22 @@ public class TimeAnnotator extends Tempo
 			name = PARAM_FEATURE_SELECTION_URI,
 			description = "provides a URI where the feature selection data will be written")
 	protected URI featureSelectionURI;
+	
+	public static final String PARAM_SMOTE_NUM_NEIGHBORS = "NumOfNeighborForSMOTE";
+
+	  @ConfigurationParameter(
+			  name = PARAM_SMOTE_NUM_NEIGHBORS,
+			  mandatory = false,
+			  description = "the number of neighbors used for minority instances for SMOTE algorithm")
+	  protected Float smoteNumOfNeighbors = 0f;
 
 	public static final String TIMEX_VIEW = "TimexView";
 
 	public static AnalysisEngineDescription createDataWriterDescription(
 			Class<?> dataWriterClass,
 					File outputDirectory,
-					float featureSelect) throws ResourceInitializationException {
+					float featureSelect,
+					float smoteNeighborNumber) throws ResourceInitializationException {
 		return AnalysisEngineFactory.createPrimitiveDescription(
 				TimeAnnotator.class,
 				CleartkAnnotator.PARAM_IS_TRAINING,
@@ -92,7 +102,9 @@ public class TimeAnnotator extends Tempo
 				DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
 				outputDirectory,
 				TimeAnnotator.PARAM_FEATURE_SELECTION_THRESHOLD,
-		        featureSelect);
+		        featureSelect,
+		        EventAnnotator.PARAM_SMOTE_NUM_NEIGHBORS,
+		        smoteNeighborNumber);
 	}
 
 	public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
@@ -121,7 +133,7 @@ public class TimeAnnotator extends Tempo
 	private static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
 
 	public static FeatureSelection<String> createFeatureSelection(double threshold) {
-		return new Chi2FeatureSelection<String>(TimeAnnotator.FEATURE_SELECTION_NAME, threshold);
+		return new Chi2FeatureSelection<String>(TimeAnnotator.FEATURE_SELECTION_NAME, threshold, true);
 	}
 	
 	public static URI createFeatureSelectionURI(File outputDirectoryName) {
@@ -159,7 +171,7 @@ public class TimeAnnotator extends Tempo
 		parseExtractor = new ParseSpanFeatureExtractor();
 
 		//initialize feature selection
-		if (featureSelectionThreshold == 0) {
+		if (featureSelectionThreshold == 1) {
 			this.featureSelection = null;
 		} else {
 			this.featureSelection = TimeAnnotator.createFeatureSelection(this.featureSelectionThreshold);
@@ -176,10 +188,13 @@ public class TimeAnnotator extends Tempo
 
 	@Override
 	public void process(JCas jCas, Segment segment) throws AnalysisEngineProcessException {
+		//TRY SMOTE algorithm here to generate more minority class samples
+	    SMOTEplus smote = new SMOTEplus((int)Math.ceil(this.smoteNumOfNeighbors));
+	    
 		// classify tokens within each sentence
 		for (Sentence sentence : JCasUtil.selectCovered(jCas, Sentence.class, segment)) {
 			List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
-
+			
 			// during training, the list of all outcomes for the tokens
 			List<String> outcomes;
 			if (this.isTraining()) {
@@ -234,12 +249,20 @@ public class TimeAnnotator extends Tempo
 		        }
 				
 				// if training, write to data file
-				if (this.isTraining()) {
-					String outcome = outcomes.get(tokenIndex);
-					this.dataWriter.write(new Instance<String>(outcome, features));
-				}else {// if predicting, add prediction to outcomes
-					outcomes.add(this.classifier.classify(features));
-				}
+		        if (this.isTraining()) {
+		        	String outcome = outcomes.get(tokenIndex);
+		        	// majority-class ("O") instances are written unchanged (no down-sampling is applied here)
+		        	if (outcome.equals("O")) {
+		        		this.dataWriter.write(new Instance<String>(outcome, features));
+
+		        	}else{//for minority instances:
+		        		Instance<String> minorityInst = new Instance<String>(outcome, features);
+		        		this.dataWriter.write(minorityInst);
+		        		smote.addInstance(minorityInst);//add minority instances to SMOTE algorithm
+		        	}
+		        }else {// if predicting, add prediction to outcomes
+		        	outcomes.add(this.classifier.classify(features));
+		        }
 			}
 
 			// during prediction, convert chunk labels to times and add them to the CAS
@@ -253,5 +276,11 @@ public class TimeAnnotator extends Tempo
 				this.timeChunking.createChunks(timexCas, tokens, outcomes);
 			}
 		}
+		if(this.isTraining() && this.smoteNumOfNeighbors >= 1){ //add synthetic instances to datawriter, if smote is selected
+	    	Iterable<Instance<String>> syntheticInsts = smote.populateMinorityClass();
+	    	for( Instance<String> sytheticInst: syntheticInsts){
+	    		this.dataWriter.write(sytheticInst);
+	    	}
+	    }
 	}
 }

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java?rev=1507033&r1=1507032&r2=1507033&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java Thu Jul 25 15:57:15 2013
@@ -64,8 +64,11 @@ public class EvaluationOfTimeSpans exten
 
 	static interface Options extends Evaluation_ImplBase.Options {
 
-		@Option(longName = "featureSelectionThreshold", defaultValue = "0")
+		@Option(longName = "featureSelectionThreshold", defaultValue = "1")
 		public float getFeatureSelectionThreshold();
+		
+		@Option(longName = "SMOTENeighborNumber", defaultValue = "0")
+	    public float getSMOTENeighborNumber();
 	}
 
 	public static void main(String[] args) throws Exception {
@@ -99,6 +102,7 @@ public class EvaluationOfTimeSpans exten
 					options.getXMIDirectory(),
 					options.getTreebankDirectory(),
 					options.getFeatureSelectionThreshold(),
+					options.getSMOTENeighborNumber(),
 					annotatorClass,
 					options.getPrintOverlappingSpans(),
 					annotatorTrainingArguments.get(annotatorClass));
@@ -131,6 +135,8 @@ public class EvaluationOfTimeSpans exten
 	private String[] trainingArguments;
 	
 	private float featureSelectionThreshold;
+	
+	private float smoteNeighborNumber;
 
 	public EvaluationOfTimeSpans(
 			File baseDirectory,
@@ -140,6 +146,7 @@ public class EvaluationOfTimeSpans exten
 			File xmiDirectory,
 			File treebankDirectory,
 			float featureSelectionThreshold,
+			float numOfSmoteNeighbors,
 			Class<? extends JCasAnnotator_ImplBase> annotatorClass,
 					boolean printOverlapping,
 					String[] trainingArguments) {
@@ -148,6 +155,7 @@ public class EvaluationOfTimeSpans exten
 		this.featureSelectionThreshold = featureSelectionThreshold;
 		this.trainingArguments = trainingArguments;
 		this.printOverlapping = printOverlapping;
+		this.smoteNeighborNumber = numOfSmoteNeighbors;
 	}
 
 	@Override
@@ -164,7 +172,8 @@ public class EvaluationOfTimeSpans exten
 				return TimeAnnotator.createDataWriterDescription(
 						dataWriterClass,
 						this.getModelDirectory(directory),
-						this.featureSelectionThreshold);
+						this.featureSelectionThreshold,
+						this.smoteNeighborNumber);
 			}
 			return AnalysisEngineFactory.createPrimitiveDescription(
 					this.annotatorClass,