You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ma...@apache.org on 2013/01/17 22:31:28 UTC
svn commit: r1434924 -
/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java
Author: mattcoarr
Date: Thu Jan 17 21:31:28 2013
New Revision: 1434924
URL: http://svn.apache.org/viewvc?rev=1434924&view=rev
Log:
* some feature tuning [courtesy of Ben Wellner (MITRE)]
* some work to incorporate dependency parser features [courtesy of Ben Wellner (MITRE)]
Modified:
incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java
Modified: incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java?rev=1434924&r1=1434923&r2=1434924&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java (original)
+++ incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java Thu Jan 17 21:31:28 2013
@@ -25,6 +25,8 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import org.apache.uima.jcas.tcas.Annotation;
+
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
@@ -45,6 +47,7 @@ import org.cleartk.classifier.feature.ex
import org.cleartk.classifier.feature.extractor.ContextExtractor.Covered;
import org.cleartk.classifier.feature.extractor.ContextExtractor.Preceding;
import org.cleartk.classifier.feature.extractor.ContextExtractor.Following;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor;
import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor;
import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
import org.cleartk.classifier.feature.extractor.simple.SpannedTextExtractor;
@@ -57,13 +60,14 @@ import org.cleartk.classifier.feature.pr
import org.cleartk.classifier.opennlp.DefaultMaxentDataWriterFactory;
import org.cleartk.classifier.opennlp.MaxentDataWriterFactory_ImplBase;
import org.cleartk.type.test.Token;
+import org.cleartk.classifier.Feature;
import org.uimafit.descriptor.ConfigurationParameter;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.ConfigurationParameterFactory;
import org.uimafit.util.JCasUtil;
import org.apache.commons.lang.StringUtils;
-import org.apache.ctakes.assertion.medfacts.cleartk.extractors.SurroundingExtractor;
+
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
@@ -71,6 +75,8 @@ import org.apache.ctakes.typesystem.type
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+
public abstract class AssertionCleartkAnalysisEngine extends
CleartkAnnotator<String>
{
@@ -95,14 +101,30 @@ public abstract class AssertionCleartkAn
defaultValue = "false")
boolean printErrors;
+ public ConllDependencyNode findAnnotationHead(JCas jcas, Annotation annotation) {
+   // Returns the first covered dependency node whose own head is missing or lies outside the annotation, or null if none.
+   for (ConllDependencyNode depNode : JCasUtil.selectCovered(jcas, ConllDependencyNode.class, annotation)) {
+     // Offsets are compared as half-open spans [begin, end): a head that starts exactly at annotation.getEnd() is outside.
+     ConllDependencyNode head = depNode.getHead();
+     if (head == null || head.getEnd() <= annotation.getBegin() || head.getBegin() >= annotation.getEnd()) {
+       // No head, or the head is outside the bounds of the annotation, so this node is taken as the annotation's head
+       return depNode;
+     }
+   }
+   // Reached when no dependency node is covered by the annotation, or every covered node's head is itself inside it
+   return null;
+ }
+
+
//private SimpleFeatureExtractor tokenFeatureExtractor;
protected List<ContextExtractor<IdentifiedAnnotation>> contextFeatureExtractors;
protected List<ContextExtractor<BaseToken>> tokenContextFeatureExtractors;
+ protected List<CleartkExtractor> tokenCleartkExtractors;
protected List<SimpleFeatureExtractor> entityFeatureExtractors;
- private List<SimpleFeatureExtractor> surroundingFeatureExtractors;
+ @SuppressWarnings("deprecation")
public void initialize(UimaContext context) throws ResourceInitializationException {
super.initialize(context);
@@ -129,54 +151,43 @@ public abstract class AssertionCleartkAn
// a list of feature extractors that require the token and the sentence
this.contextFeatureExtractors = new ArrayList<ContextExtractor<IdentifiedAnnotation>>();
+
+ this.tokenCleartkExtractors = new ArrayList<CleartkExtractor>();
+
+ CleartkExtractor tokenExtraction1 =
+ new CleartkExtractor(
+ BaseToken.class,
+ new CoveredTextExtractor(),
+ //new CleartkExtractor.Covered(),
+ new CleartkExtractor.LastCovered(2),
+ new CleartkExtractor.Preceding(5),
+ new CleartkExtractor.Following(4),
+ new CleartkExtractor.Bag(new CleartkExtractor.Preceding(10)),
+ new CleartkExtractor.Bag(new CleartkExtractor.Following(10))
+ );
+
+ CleartkExtractor posExtraction1 =
+ new CleartkExtractor(
+ BaseToken.class,
+ new TypePathExtractor(BaseToken.class, "partOfSpeech"),
+ new CleartkExtractor.LastCovered(2),
+ new CleartkExtractor.Preceding(3),
+ new CleartkExtractor.Following(2)
+ );
+
+ this.tokenCleartkExtractors.add(tokenExtraction1);
+ //this.tokenCleartkExtractors.add(posExtraction1);
+
this.contextFeatureExtractors.add(new ContextExtractor<IdentifiedAnnotation>(
IdentifiedAnnotation.class,
new CoveredTextExtractor(),
//new TypePathExtractor(IdentifiedAnnotation.class, "stem"),
new Preceding(2),
new Following(2)));
-
- ContextExtractor<BaseToken> tokenContextExtractor1 = new ContextExtractor<BaseToken>(
- BaseToken.class,
- new SpannedTextExtractor(),
- new ContextExtractor.Ngram(new Covered()),
-
- new ContextExtractor.Ngram(new Preceding(1)),
- new ContextExtractor.Ngram(new Preceding(2)),
- //new ContextExtractor.Ngram(new Preceding(1, 2)),
- new ContextExtractor.Ngram(new Preceding(3)),
- //new ContextExtractor.Ngram(new Preceding(2, 3)),
- new ContextExtractor.Ngram(new Following(1)),
- new ContextExtractor.Ngram(new Following(2)),
- //new ContextExtractor.Ngram(new Following(1, 2)),
- new ContextExtractor.Ngram(new Following(3))
- //new ContextExtractor.Ngram(new Following(2,3))
- );
- tokenContextFeatureExtractors = new ArrayList<ContextExtractor<BaseToken>>();
- tokenContextFeatureExtractors.add(tokenContextExtractor1);
- TypePathExtractor posExtractor = new TypePathExtractor(BaseToken.class, "partOfSpeech");
- ContextExtractor<BaseToken> extractor2 = new ContextExtractor<BaseToken>(
- BaseToken.class,
- posExtractor,
- new ContextExtractor.Ngram(new Covered()),
- new ContextExtractor.Ngram(new Preceding(1)),
- new ContextExtractor.Ngram(new Preceding(2)),
- new ContextExtractor.Ngram(new Following(1)),
- new ContextExtractor.Ngram(new Following(2))
- /*
- new ContextExtractor.Covered(),
- new ContextExtractor.Ngram(new Covered())
-
- new ContextExtractor.Ngram(new Preceding(1)),
- new ContextExtractor.Ngram(new Preceding(2)),
- */
- );
- tokenContextFeatureExtractors.add(extractor2);
-
- this.surroundingFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
- SimpleFeatureExtractor surround1 = new SurroundingExtractor();
- this.surroundingFeatureExtractors.add(surround1);
+ // stab at dependency-based features
+ //List<Feature> features = new ArrayList<Feature>();
+ //ConllDependencyNode node1 = findAnnotationHead(jCas, arg1);
}
@@ -264,6 +275,7 @@ public abstract class AssertionCleartkAn
}
//Sentence sentence = sentenceList.iterator().next();
+ /*
if (sentence != null)
{
for (ContextExtractor<IdentifiedAnnotation> extractor : this.contextFeatureExtractors) {
@@ -274,24 +286,24 @@ public abstract class AssertionCleartkAn
// TODO extract context features for annotations that don't fall within a sentence
logger.log(Level.WARN, "FIXME/TODO: generate context features for entities that don't fall within a sentence");
}
+ */
+ /*
for (ContextExtractor<BaseToken> extractor : this.tokenContextFeatureExtractors) {
- instance.addAll(extractor.extract(identifiedAnnotationView, entityMention));
- }
+ instance.addAll(extractor.extract(identifiedAnnotationView, entityMention));
+ }
+ */
+ for (CleartkExtractor extractor : this.tokenCleartkExtractors) {
+ //instance.addAll(extractor.extractWithin(identifiedAnnotationView, entityMention, sentence));
+ instance.addAll(extractor.extract(identifiedAnnotationView, entityMention));
+ }
+
+
+ /*
for (SimpleFeatureExtractor extractor : this.entityFeatureExtractors) {
instance.addAll(extractor.extract(identifiedAnnotationView, entityMention));
}
-
- for (SimpleFeatureExtractor extractor : this.surroundingFeatureExtractors)
- {
- instance.addAll(extractor.extract(identifiedAnnotationView, entityMention));
- }
-
- logger.log(Level.DEBUG, String.format("[%s] expected: ''; actual: ''; features: %s",
- this.getClass().getSimpleName(),
- instance.toString()
- //StringUtils.join(instance.getFeatures(), ", ")
- ));
+ */
setClassLabel(entityMention, instance);