You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by st...@apache.org on 2012/12/21 03:44:18 UTC

svn commit: r1424790 - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/ ae/feature/ eval/

Author: stevenbethard
Date: Fri Dec 21 02:44:18 2012
New Revision: 1424790

URL: http://svn.apache.org/viewvc?rev=1424790&view=rev
Log:
Gets rid of PhraseExtractor (which was broken code anyway) and abstracts a ChunkingExtractor out of the entity-chunk feature extraction code to replace it.

Added:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java   (with props)
Removed:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java
Modified:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1424790&r1=1424789&r2=1424790&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Fri Dec 21 02:44:18 2012
@@ -23,17 +23,16 @@ import java.io.File;
 import java.io.IOException;
 import java.net.URI;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.Random;
 
-import org.apache.ctakes.temporal.ae.feature.PhraseExtractor;
+import org.apache.ctakes.temporal.ae.feature.ChunkingExtractor;
 import org.apache.ctakes.temporal.ae.feature.SRLExtractor;
 import org.apache.ctakes.temporal.ae.feature.selection.Chi2FeatureSelection;
 import org.apache.ctakes.temporal.ae.feature.selection.FeatureSelection;
 import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
@@ -127,10 +126,12 @@ public class EventAnnotator extends Clea
 
   private BIOChunking<BaseToken, EventMention> eventChunking;
 
+  private BIOChunking<BaseToken, Chunk> phraseChunking;
+
   protected SimpleFeatureExtractor tokenFeatureExtractor;
 
   protected CleartkExtractor contextFeatureExtractor;
-
+  
   private FeatureSelection<String> featureSelection;
 
   private static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
@@ -152,6 +153,10 @@ public class EventAnnotator extends Clea
         BaseToken.class,
         EntityMention.class,
         "typeID");
+    this.phraseChunking = new BIOChunking<BaseToken, Chunk>(
+        BaseToken.class,
+        Chunk.class,
+        "chunkType");
     this.eventChunking = new BIOChunking<BaseToken, EventMention>(
         BaseToken.class,
         EventMention.class);
@@ -160,7 +165,6 @@ public class EventAnnotator extends Clea
         new CoveredTextExtractor(),
         new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
         new TypePathExtractor(BaseToken.class, "partOfSpeech"),
-        new PhraseExtractor(),
         new SRLExtractor());
     this.contextFeatureExtractor = new CleartkExtractor(
         BaseToken.class,
@@ -211,16 +215,22 @@ public class EventAnnotator extends Clea
           CONST.NE_TYPE_ID_PROCEDURE,
           CONST.NE_TYPE_ID_UNKNOWN };
       List<EntityMention> entities = JCasUtil.selectCovered(jCas, EntityMention.class, sentence);
-      Map<Integer, List<String>> entityTagsByType = new HashMap<Integer, List<String>>();
+      List<ChunkingExtractor> chunkingExtractors = Lists.newArrayList(); 
       for (int typeID : entityTypeIDs) {
         Predicate<EntityMention> hasTypeID = hasEntityType(typeID);
+        String name = String.format("EntityTag_%d", typeID);
         List<EntityMention> subEntities = Lists.newArrayList(Iterables.filter(entities, hasTypeID));
-        entityTagsByType.put(typeID, this.entityChunking.createOutcomes(jCas, tokens, subEntities));
+        chunkingExtractors.add(new ChunkingExtractor(name, this.entityChunking, jCas, tokens, subEntities));
       }
+      
+      // add extractor for phrase chunks
+      List<Chunk> chunks = JCasUtil.selectCovered(jCas, Chunk.class, sentence);
+      chunkingExtractors.add(new ChunkingExtractor("PhraseTag", this.phraseChunking, jCas, tokens, chunks));
 
       // extract features for all tokens
       int tokenIndex = -1;
-      int window = 2;
+      int nChunkLabelsBefore = 2;
+      int nChunkLabelsAfter = 2;
       int nPreviousClassifications = 2;
 
       for (BaseToken token : tokens) {
@@ -234,16 +244,11 @@ public class EventAnnotator extends Clea
         // features from surrounding tokens
         features.addAll(this.contextFeatureExtractor.extractWithin(jCas, token, sentence));
 
-        // features from surrounding entities
-        for (int typeID : entityTypeIDs) {
-          List<String> tokenEntityTags = entityTagsByType.get(typeID);
-          int begin = Math.max(tokenIndex - window, 0);
-          int end = Math.min(tokenIndex + window, tokenEntityTags.size());
-          for (int i = begin; i < end; ++i) {
-            String name = String.format("EntityTag_%d_%d", typeID, i - begin);
-            features.add(new Feature(name, tokenEntityTags.get(i)));
-          }
+        // features from surrounding entity, phrase, etc. chunk-labels
+        for (ChunkingExtractor extractor : chunkingExtractors) {
+          features.addAll(extractor.extract(tokenIndex, nChunkLabelsBefore, nChunkLabelsAfter));
         }
+
         // features from previous classifications
         for (int i = nPreviousClassifications; i > 0; --i) {
           int index = tokenIndex - i;

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java?rev=1424790&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java Fri Dec 21 02:44:18 2012
@@ -0,0 +1,40 @@
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.List;
+
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.chunking.Chunking;
+
+import com.google.common.collect.Lists;
+
+/** Builds features from a window over the outcome labels that a {@link Chunking} creates for a list of sub-chunks. */
+public class ChunkingExtractor {
+  private final String name; // prefix of every feature name produced by extract()
+  private final List<?> subChunkLabels; // result of chunking.createOutcomes(); presumably one label per sub-chunk -- confirm
+
+  /** Stores {@code name} and the outcomes of {@code chunking.createOutcomes(jCas, subChunks, chunks)}; rethrows its exception. */
+  public <SUB_CHUNK_TYPE extends Annotation, CHUNK_TYPE extends Annotation> ChunkingExtractor(
+      String name,
+      Chunking<?, SUB_CHUNK_TYPE, CHUNK_TYPE> chunking,
+      JCas jCas,
+      List<SUB_CHUNK_TYPE> subChunks,
+      List<CHUNK_TYPE> chunks) throws AnalysisEngineProcessException {
+    this.name = name;
+    this.subChunkLabels = chunking.createOutcomes(jCas, subChunks, chunks);
+  }
+
+  /** Returns a feature "name_offset" for each label at tokenIndex - nBefore .. tokenIndex + nAfter, clipped to the label list. */
+  public List<Feature> extract(int tokenIndex, int nBefore, int nAfter) {
+    List<Feature> features = Lists.newArrayList();
+    int begin = Math.max(tokenIndex - nBefore, 0);
+    int end = Math.min(tokenIndex + nAfter + 1, this.subChunkLabels.size());
+    for (int i = begin; i < end; ++i) {
+      // offset is measured from tokenIndex, not from begin, so names do not shift when the window is clipped at index 0
+      features.add(new Feature(String.format("%s_%d", this.name, i - tokenIndex), this.subChunkLabels.get(i)));
+    }
+    return features;
+  }
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1424790&r1=1424789&r2=1424790&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Fri Dec 21 02:44:18 2012
@@ -85,7 +85,8 @@ public class EvaluationOfEventSpans exte
       float probabilityOfKeepingANegativeExample,
       float featureSelectionThreshold) {
     super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, EnumSet.of(
-        AnnotatorType.PART_OF_SPEECH_TAGS));
+        AnnotatorType.PART_OF_SPEECH_TAGS,
+        AnnotatorType.CHUNKS));
         //AnnotatorType.UMLS_NAMED_ENTITIES,
         //AnnotatorType.LEXICAL_VARIANTS,
         //AnnotatorType.DEPENDENCIES,

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1424790&r1=1424789&r2=1424790&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Fri Dec 21 02:44:18 2012
@@ -69,7 +69,7 @@ public abstract class Evaluation_ImplBas
     org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
 
   public enum AnnotatorType {
-    PART_OF_SPEECH_TAGS, UMLS_NAMED_ENTITIES, LEXICAL_VARIANTS, DEPENDENCIES, SEMANTIC_ROLES
+    PART_OF_SPEECH_TAGS, UMLS_NAMED_ENTITIES, LEXICAL_VARIANTS, CHUNKS, DEPENDENCIES, SEMANTIC_ROLES
   }
 
   protected final String GOLD_VIEW_NAME = "GoldView";
@@ -187,11 +187,8 @@ public abstract class Evaluation_ImplBas
           POSTagger.CASE_SENSITIVE_PARAM,
           true));
     }
-
-    // identify UMLS named entities if requested
-    if (this.annotatorFlags.contains(AnnotatorType.UMLS_NAMED_ENTITIES)) {
-      // remove gold mentions if they're there (we'll add cTAKES mentions later instead)
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(EntityMentionRemover.class));
+    
+    if (this.annotatorFlags.contains(AnnotatorType.CHUNKS)) {
       // identify chunks
       aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
           Chunker.class,
@@ -199,6 +196,12 @@ public abstract class Evaluation_ImplBas
           Chunker.class.getResource("../models/chunk-model.claims-1.5.zip").toURI().getPath(),
           Chunker.CHUNKER_CREATOR_CLASS_PARAM,
           DefaultChunkCreator.class));
+    }
+
+    // identify UMLS named entities if requested
+    if (this.annotatorFlags.contains(AnnotatorType.UMLS_NAMED_ENTITIES)) {
+      // remove gold mentions if they're there (we'll add cTAKES mentions later instead)
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(EntityMentionRemover.class));
       // adjust NP in NP NP to span both
       aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
           ChunkAdjuster.class,