You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/06/20 23:20:28 UTC

svn commit: r1495191 - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/BackwardsTimeAnnotator.java

Author: tmill
Date: Thu Jun 20 21:20:27 2013
New Revision: 1495191

URL: http://svn.apache.org/r1495191
Log:
Added backwards BIO tagging TIMEX detector.

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/BackwardsTimeAnnotator.java   (with props)

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/BackwardsTimeAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/BackwardsTimeAnnotator.java?rev=1495191&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/BackwardsTimeAnnotator.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/BackwardsTimeAnnotator.java Thu Jun 20 21:20:27 2013
@@ -0,0 +1,149 @@
+package org.apache.ctakes.temporal.ae;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.temporal.ae.feature.ParseSpanFeatureExtractor;
+import org.apache.ctakes.temporal.ae.feature.TimeWordTypeExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.chunking.BIOChunking;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Preceding;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor.PatternType;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor;
+import org.uimafit.util.JCasUtil;
+
+import com.google.common.collect.Lists;
+
+
+/**
+ * Time-expression annotator that assigns BIO chunk labels to the tokens of each
+ * sentence while walking the sentence from its last token to its first.
+ *
+ * <p>In training mode the gold {@link TimeMention}s are converted to per-token
+ * outcomes, reversed, and written out as training instances. In prediction mode
+ * each token is classified in reversed order; the predicted outcomes are then
+ * reversed back and turned into {@link TimeMention} chunks.
+ */
+public class BackwardsTimeAnnotator extends TemporalEntityAnnotator_ImplBase{
+
+  /** Number of already-assigned outcomes that are fed back in as features. */
+  private static final int N_PREVIOUS_CLASSIFICATIONS = 2;
+
+  /** Outcome label used for tokens outside any chunk, and as padding. */
+  private static final String OUTSIDE_OUTCOME = "O";
+
+  /** Extractors applied to the token currently being classified. */
+  protected List<SimpleFeatureExtractor> tokenFeatureExtractors;
+
+  /** Extractors applied to the tokens surrounding the current token. */
+  protected List<CleartkExtractor> contextFeatureExtractors;
+
+  /** Extractor producing features for a character span. */
+  protected ParseSpanFeatureExtractor parseExtractor;
+
+  /** Converts between TimeMention annotations and per-token BIO outcomes. */
+  private BIOChunking<BaseToken, TimeMention> timeChunking;
+
+  /**
+   * Sets up the chunking scheme and all feature extractors.
+   *
+   * @param context the UIMA context, forwarded to the superclass
+   * @throws ResourceInitializationException if superclass initialization fails
+   */
+  @Override
+  public void initialize(UimaContext context) throws ResourceInitializationException {
+    super.initialize(context);
+
+    // define chunking
+    this.timeChunking = new BIOChunking<BaseToken, TimeMention>(BaseToken.class, TimeMention.class);
+
+    // the same per-token extractors are used for the token itself and its context
+    CombinedExtractor allExtractors = new CombinedExtractor(
+        new CoveredTextExtractor(),
+        new CharacterCategoryPatternExtractor(PatternType.REPEATS_MERGED),
+        new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+        new TypePathExtractor(BaseToken.class, "partOfSpeech"),
+        new TimeWordTypeExtractor());
+
+    this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
+    this.tokenFeatureExtractors.add(allExtractors);
+
+    this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
+    this.contextFeatureExtractors.add(new CleartkExtractor(
+        BaseToken.class,
+        allExtractors,
+        new Preceding(3),
+        new Following(3)));
+
+    this.parseExtractor = new ParseSpanFeatureExtractor();
+  }
+
+  /**
+   * Processes every sentence covered by the segment, visiting its tokens in
+   * reversed order.
+   *
+   * @param jCas the CAS holding the sentences, tokens and time mentions
+   * @param segment the segment whose sentences are processed
+   * @throws AnalysisEngineProcessException if writing an instance or classifying fails
+   */
+  @Override
+  public void process(JCas jCas, Segment segment) throws AnalysisEngineProcessException {
+    // classify tokens within each sentence
+    for (Sentence sentence : JCasUtil.selectCovered(jCas, Sentence.class, segment)) {
+      List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
+
+      // during training, the list of all outcomes for the tokens
+      List<String> outcomes;
+      if (this.isTraining()) {
+        List<TimeMention> times = JCasUtil.selectCovered(jCas, TimeMention.class, sentence);
+        outcomes = this.timeChunking.createOutcomes(jCas, tokens, times);
+        outcomes = Lists.reverse(outcomes);
+      }
+      // during prediction, the list of outcomes predicted so far
+      else {
+        outcomes = new ArrayList<String>();
+      }
+
+      // from here on, index 0 is the last token of the sentence
+      tokens = Lists.reverse(tokens);
+
+      // extract features for all tokens
+      int tokenIndex = -1;
+      for (BaseToken token : tokens) {
+        ++tokenIndex;
+
+        List<Feature> features = new ArrayList<Feature>();
+        // features from token attributes
+        for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
+          features.addAll(extractor.extract(jCas, token));
+        }
+        // features from surrounding tokens
+        for (CleartkExtractor extractor : this.contextFeatureExtractors) {
+          features.addAll(extractor.extractWithin(jCas, token, sentence));
+        }
+        // features from previous classifications (padded with "O" at the boundary)
+        for (int i = N_PREVIOUS_CLASSIFICATIONS; i > 0; --i) {
+          int index = tokenIndex - i;
+          String previousOutcome = index < 0 ? OUTSIDE_OUTCOME : outcomes.get(index);
+          features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+        }
+        // features from the span of the chunk built so far. Tokens are visited in
+        // reversed order, so the already-labelled tokens sit at lower indices and
+        // (assuming selectCovered returned them in text order) later in the text:
+        // the span therefore begins at the current token and ends at the far token.
+        BaseToken endToken = findChunkEnd(tokens, outcomes, tokenIndex);
+        features.addAll(parseExtractor.extract(jCas, token.getBegin(), endToken.getEnd()));
+
+        // if training, write to data file
+        if (this.isTraining()) {
+          String outcome = outcomes.get(tokenIndex);
+          this.dataWriter.write(new Instance<String>(outcome, features));
+        }
+
+        // if predicting, add prediction to outcomes
+        else {
+          outcomes.add(this.classifier.classify(features));
+        }
+      }
+
+      // during prediction, convert chunk labels to times and add them to the CAS
+      if (!this.isTraining()) {
+        // restore the original order before building chunks
+        tokens = Lists.reverse(tokens);
+        outcomes = Lists.reverse(outcomes);
+        this.timeChunking.createChunks(jCas, tokens, outcomes);
+      }
+    }
+  }
+
+  /**
+   * Finds the far end of the run of non-"O" outcomes immediately preceding
+   * {@code tokenIndex} in the reversed lists.
+   *
+   * @param reversedTokens the sentence tokens in reversed order
+   * @param outcomes outcomes aligned with {@code reversedTokens}; must hold
+   *     entries for every index below {@code tokenIndex}
+   * @param tokenIndex index of the token currently being classified
+   * @return the token at the lowest index of that run, or the current token
+   *     itself when the preceding outcome is "O" or there is none
+   */
+  private static BaseToken findChunkEnd(
+      List<BaseToken> reversedTokens, List<String> outcomes, int tokenIndex) {
+    BaseToken endToken = reversedTokens.get(tokenIndex);
+    for (int i = tokenIndex - 1; i >= 0; --i) {
+      if (OUTSIDE_OUTCOME.equals(outcomes.get(i))) {
+        break;
+      }
+      endToken = reversedTokens.get(i);
+    }
+    return endToken;
+  }
+
+}

Propchange: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/BackwardsTimeAnnotator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain