You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by st...@apache.org on 2012/12/21 03:44:18 UTC
svn commit: r1424790 - in
/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal:
ae/ ae/feature/ eval/
Author: stevenbethard
Date: Fri Dec 21 02:44:18 2012
New Revision: 1424790
URL: http://svn.apache.org/viewvc?rev=1424790&view=rev
Log:
Gets rid of PhraseExtractor (which was broken code anyway) and abstracts a ChunkingExtractor out of the entity-chunk feature extraction code to replace it.
Added:
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java (with props)
Removed:
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java
Modified:
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1424790&r1=1424789&r2=1424790&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Fri Dec 21 02:44:18 2012
@@ -23,17 +23,16 @@ import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
import java.util.Random;
-import org.apache.ctakes.temporal.ae.feature.PhraseExtractor;
+import org.apache.ctakes.temporal.ae.feature.ChunkingExtractor;
import org.apache.ctakes.temporal.ae.feature.SRLExtractor;
import org.apache.ctakes.temporal.ae.feature.selection.Chi2FeatureSelection;
import org.apache.ctakes.temporal.ae.feature.selection.FeatureSelection;
import org.apache.ctakes.typesystem.type.constants.CONST;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
@@ -127,10 +126,12 @@ public class EventAnnotator extends Clea
private BIOChunking<BaseToken, EventMention> eventChunking;
+ private BIOChunking<BaseToken, Chunk> phraseChunking;
+
protected SimpleFeatureExtractor tokenFeatureExtractor;
protected CleartkExtractor contextFeatureExtractor;
-
+
private FeatureSelection<String> featureSelection;
private static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
@@ -152,6 +153,10 @@ public class EventAnnotator extends Clea
BaseToken.class,
EntityMention.class,
"typeID");
+ this.phraseChunking = new BIOChunking<BaseToken, Chunk>(
+ BaseToken.class,
+ Chunk.class,
+ "chunkType");
this.eventChunking = new BIOChunking<BaseToken, EventMention>(
BaseToken.class,
EventMention.class);
@@ -160,7 +165,6 @@ public class EventAnnotator extends Clea
new CoveredTextExtractor(),
new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
new TypePathExtractor(BaseToken.class, "partOfSpeech"),
- new PhraseExtractor(),
new SRLExtractor());
this.contextFeatureExtractor = new CleartkExtractor(
BaseToken.class,
@@ -211,16 +215,22 @@ public class EventAnnotator extends Clea
CONST.NE_TYPE_ID_PROCEDURE,
CONST.NE_TYPE_ID_UNKNOWN };
List<EntityMention> entities = JCasUtil.selectCovered(jCas, EntityMention.class, sentence);
- Map<Integer, List<String>> entityTagsByType = new HashMap<Integer, List<String>>();
+ List<ChunkingExtractor> chunkingExtractors = Lists.newArrayList();
for (int typeID : entityTypeIDs) {
Predicate<EntityMention> hasTypeID = hasEntityType(typeID);
+ String name = String.format("EntityTag_%d", typeID);
List<EntityMention> subEntities = Lists.newArrayList(Iterables.filter(entities, hasTypeID));
- entityTagsByType.put(typeID, this.entityChunking.createOutcomes(jCas, tokens, subEntities));
+ chunkingExtractors.add(new ChunkingExtractor(name, this.entityChunking, jCas, tokens, subEntities));
}
+
+ // add extractor for phrase chunks
+ List<Chunk> chunks = JCasUtil.selectCovered(jCas, Chunk.class, sentence);
+ chunkingExtractors.add(new ChunkingExtractor("PhraseTag", this.phraseChunking, jCas, tokens, chunks));
// extract features for all tokens
int tokenIndex = -1;
- int window = 2;
+ int nChunkLabelsBefore = 2;
+ int nChunkLabelsAfter = 2;
int nPreviousClassifications = 2;
for (BaseToken token : tokens) {
@@ -234,16 +244,11 @@ public class EventAnnotator extends Clea
// features from surrounding tokens
features.addAll(this.contextFeatureExtractor.extractWithin(jCas, token, sentence));
- // features from surrounding entities
- for (int typeID : entityTypeIDs) {
- List<String> tokenEntityTags = entityTagsByType.get(typeID);
- int begin = Math.max(tokenIndex - window, 0);
- int end = Math.min(tokenIndex + window, tokenEntityTags.size());
- for (int i = begin; i < end; ++i) {
- String name = String.format("EntityTag_%d_%d", typeID, i - begin);
- features.add(new Feature(name, tokenEntityTags.get(i)));
- }
+ // features from surrounding entity, phrase, etc. chunk-labels
+ for (ChunkingExtractor extractor : chunkingExtractors) {
+ features.addAll(extractor.extract(tokenIndex, nChunkLabelsBefore, nChunkLabelsAfter));
}
+
// features from previous classifications
for (int i = nPreviousClassifications; i > 0; --i) {
int index = tokenIndex - i;
Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java?rev=1424790&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java Fri Dec 21 02:44:18 2012
@@ -0,0 +1,40 @@
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.List;
+
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.chunking.Chunking;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Turns the labels that a {@link Chunking} assigns to a sequence of sub-chunks
+ * into features describing a window of labels around one position in that sequence.
+ *
+ * <p>The labels are computed once, in the constructor, and only read afterwards.
+ */
+public class ChunkingExtractor {
+
+  /** Prefix used for the name of every feature this extractor produces. */
+  private final String name;
+
+  /**
+   * Outcomes returned by {@code chunking.createOutcomes(...)}.
+   * NOTE(review): presumably one label per sub-chunk, in the same order as the
+   * {@code subChunks} list given to the constructor -- confirm against the Chunking contract.
+   */
+  private final List<?> subChunkLabels;
+
+  /**
+   * Creates an extractor by asking {@code chunking} to label {@code subChunks}
+   * according to {@code chunks}.
+   *
+   * @param name prefix for the generated feature names
+   * @param chunking the chunking whose {@code createOutcomes} produces the labels
+   * @param jCas the CAS handed through to {@code createOutcomes}
+   * @param subChunks the annotations to be labeled
+   * @param chunks the annotations the labels are derived from
+   * @throws AnalysisEngineProcessException if {@code createOutcomes} throws it
+   */
+  public <SUB_CHUNK_TYPE extends Annotation, CHUNK_TYPE extends Annotation> ChunkingExtractor(
+      String name,
+      Chunking<?, SUB_CHUNK_TYPE, CHUNK_TYPE> chunking,
+      JCas jCas,
+      List<SUB_CHUNK_TYPE> subChunks,
+      List<CHUNK_TYPE> chunks) throws AnalysisEngineProcessException {
+    this.name = name;
+    this.subChunkLabels = chunking.createOutcomes(jCas, subChunks, chunks);
+  }
+
+  /**
+   * Extracts one feature per label in the window
+   * {@code [tokenIndex - nBefore, tokenIndex + nAfter]}, clipped to the bounds of the
+   * label list (so fewer features are returned near either end).
+   *
+   * <p>Each feature is named {@code <name>_<offset>}, where {@code offset} is the label's
+   * position relative to {@code tokenIndex}: negative before it, 0 at it, positive after it.
+   *
+   * @param tokenIndex index into the label list of the position being described
+   * @param nBefore number of labels to include before {@code tokenIndex}
+   * @param nAfter number of labels to include after {@code tokenIndex}
+   * @return the features for the (clipped) window, in increasing index order
+   */
+  public List<Feature> extract(int tokenIndex, int nBefore, int nAfter) {
+    List<Feature> features = Lists.newArrayList();
+    int begin = Math.max(tokenIndex - nBefore, 0);
+    int end = Math.min(tokenIndex + nAfter + 1, this.subChunkLabels.size());
+    for (int i = begin; i < end; ++i) {
+      // Name by the offset from the token itself. The previous (i - begin - nBefore) only
+      // matched this when the window was not clipped; with begin clamped to 0 it shifted
+      // every name, e.g. the token's own label came out as "<name>_-2" at tokenIndex 0.
+      int offset = i - tokenIndex;
+      String featureName = String.format("%s_%d", this.name, offset);
+      features.add(new Feature(featureName, this.subChunkLabels.get(i)));
+    }
+    return features;
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/ChunkingExtractor.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1424790&r1=1424789&r2=1424790&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Fri Dec 21 02:44:18 2012
@@ -85,7 +85,8 @@ public class EvaluationOfEventSpans exte
float probabilityOfKeepingANegativeExample,
float featureSelectionThreshold) {
super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, EnumSet.of(
- AnnotatorType.PART_OF_SPEECH_TAGS));
+ AnnotatorType.PART_OF_SPEECH_TAGS,
+ AnnotatorType.CHUNKS));
//AnnotatorType.UMLS_NAMED_ENTITIES,
//AnnotatorType.LEXICAL_VARIANTS,
//AnnotatorType.DEPENDENCIES,
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1424790&r1=1424789&r2=1424790&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Fri Dec 21 02:44:18 2012
@@ -69,7 +69,7 @@ public abstract class Evaluation_ImplBas
org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
public enum AnnotatorType {
- PART_OF_SPEECH_TAGS, UMLS_NAMED_ENTITIES, LEXICAL_VARIANTS, DEPENDENCIES, SEMANTIC_ROLES
+ PART_OF_SPEECH_TAGS, UMLS_NAMED_ENTITIES, LEXICAL_VARIANTS, CHUNKS, DEPENDENCIES, SEMANTIC_ROLES
}
protected final String GOLD_VIEW_NAME = "GoldView";
@@ -187,11 +187,8 @@ public abstract class Evaluation_ImplBas
POSTagger.CASE_SENSITIVE_PARAM,
true));
}
-
- // identify UMLS named entities if requested
- if (this.annotatorFlags.contains(AnnotatorType.UMLS_NAMED_ENTITIES)) {
- // remove gold mentions if they're there (we'll add cTAKES mentions later instead)
- aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(EntityMentionRemover.class));
+
+ if (this.annotatorFlags.contains(AnnotatorType.CHUNKS)) {
// identify chunks
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
Chunker.class,
@@ -199,6 +196,12 @@ public abstract class Evaluation_ImplBas
Chunker.class.getResource("../models/chunk-model.claims-1.5.zip").toURI().getPath(),
Chunker.CHUNKER_CREATOR_CLASS_PARAM,
DefaultChunkCreator.class));
+ }
+
+ // identify UMLS named entities if requested
+ if (this.annotatorFlags.contains(AnnotatorType.UMLS_NAMED_ENTITIES)) {
+ // remove gold mentions if they're there (we'll add cTAKES mentions later instead)
+ aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(EntityMentionRemover.class));
// adjust NP in NP NP to span both
aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
ChunkAdjuster.class,