You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/04/30 22:44:14 UTC
svn commit: r1477818 - in
/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal:
ae/ConstituencyBasedTimeAnnotator.java
eval/EvaluationOfAnnotationSpans_ImplBase.java
eval/EvaluationOfTimeSpans.java
Author: tmill
Date: Tue Apr 30 20:44:13 2013
New Revision: 1477818
URL: http://svn.apache.org/r1477818
Log:
Added constituency parse-based annotator for time spans.
Added:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java
Modified:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java?rev=1477818&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java Tue Apr 30 20:44:13 2013
@@ -0,0 +1,149 @@
+package org.apache.ctakes.temporal.ae;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.CleartkAnnotator;
+import org.cleartk.classifier.CleartkProcessingException;
+import org.cleartk.classifier.DataWriter;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor;
+import static org.cleartk.classifier.feature.extractor.CleartkExtractor.*;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Covered;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor.PatternType;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor;
+import org.cleartk.classifier.jar.DefaultDataWriterFactory;
+import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
+import org.cleartk.classifier.jar.GenericJarClassifierFactory;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Finds time expressions by classifying the nodes of a constituency parse.
+ *
+ * <p>Every parse tree covered by a segment is walked top-down. Each visited node is turned into a
+ * feature list and labeled {@code TIME_MENTION} or {@code NON_TIME_MENTION}. In training mode the
+ * label comes from gold {@link TimeMention}s whose span equals the node's span, and the instance
+ * is handed to the data writer. Otherwise the classifier assigns the label and a new
+ * {@link TimeMention} is added to the CAS for every node labeled as a mention. The walk never
+ * descends below a node labeled as a mention.
+ */
+public class ConstituencyBasedTimeAnnotator extends TemporalEntityAnnotator_ImplBase {
+
+  private static final String NON_MENTION = "NON_TIME_MENTION";
+  private static final String MENTION = "TIME_MENTION";
+
+  /**
+   * Builds the description of this annotator configured for training.
+   *
+   * @param dataWriterClass data writer implementation that receives the training instances
+   * @param outputDirectory directory given to the data writer factory as its output directory
+   * @return an engine description with {@code PARAM_IS_TRAINING} set to {@code true}
+   * @throws ResourceInitializationException if the description cannot be created
+   */
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<String>> dataWriterClass,
+      File outputDirectory) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        ConstituencyBasedTimeAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriterClass,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory);
+  }
+
+  /**
+   * Builds the description of this annotator configured for classification.
+   *
+   * @param modelDirectory directory expected to contain {@code model.jar}
+   * @return an engine description with {@code PARAM_IS_TRAINING} set to {@code false}
+   * @throws ResourceInitializationException if the description cannot be created
+   */
+  public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
+      throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        ConstituencyBasedTimeAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        new File(modelDirectory, "model.jar"));
+  }
+
+  /** Extractors applied to every visited parse node, in addition to the node-based features. */
+  protected List<SimpleFeatureExtractor> featureExtractors;
+
+  /**
+   * Initializes the superclass and sets up the token-level feature extractors.
+   *
+   * @param context the UIMA context passed on to the superclass
+   * @throws ResourceInitializationException if superclass initialization fails
+   */
+  @Override
+  public void initialize(UimaContext context) throws ResourceInitializationException {
+    super.initialize(context);
+
+    // Per-token features: covered text, two character-category patterns and part of speech.
+    CombinedExtractor allExtractors = new CombinedExtractor(
+        new CoveredTextExtractor(),
+        new CharacterCategoryPatternExtractor(PatternType.REPEATS_MERGED),
+        new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+        new TypePathExtractor(BaseToken.class, "partOfSpeech"));
+
+    featureExtractors = new ArrayList<SimpleFeatureExtractor>();
+    // Bag of the per-token features over all tokens covered by the node.
+    featureExtractors.add(
+        new CleartkExtractor(BaseToken.class, allExtractors, new Bag(new Covered())));
+    // Disabled alternative: token text only.
+//    featureExtractors.add(new CleartkExtractor(BaseToken.class, new CoveredTextExtractor(), new Bag(new Covered())));
+    // Disabled alternative: bag of constituent descendant labels.
+//    featureExtractors.add(new CleartkExtractor(TreebankNode.class, new TypePathExtractor(TreebankNode.class, "nodeType"), new Bag(new Covered())));
+  }
+
+  /**
+   * Walks every parse tree covered by the segment, starting at the first child of each root.
+   *
+   * @param jCas the CAS being processed
+   * @param segment the segment whose covered parse trees are visited
+   * @throws AnalysisEngineProcessException if writing an instance or classifying fails
+   */
+  @Override
+  public void process(JCas jCas, Segment segment) throws AnalysisEngineProcessException {
+    for (TopTreebankNode root : JCasUtil.selectCovered(TopTreebankNode.class, segment)) {
+      // A root without children has nothing to classify (presumably an empty or failed
+      // parse); skip it rather than fail on getChildren(0) and abort the whole document.
+      if (root.getChildren() == null || root.getChildren().size() == 0) {
+        continue;
+      }
+      processNode(jCas, root.getChildren(0));
+    }
+  }
+
+  /**
+   * Labels one parse node and then recurses into its children, unless the node is a leaf or was
+   * labeled as a mention.
+   *
+   * @param jCas the CAS being processed
+   * @param node the parse node to label; its parent is dereferenced and must not be null
+   * @throws CleartkProcessingException if writing the instance or classifying fails
+   */
+  private void processNode(JCas jCas, TreebankNode node) throws CleartkProcessingException {
+    List<Feature> features = buildFeatures(jCas, node);
+
+    String category;
+    if (this.isTraining()) {
+      category = hasExactGoldMention(node) ? MENTION : NON_MENTION;
+      this.dataWriter.write(new Instance<String>(category, features));
+    } else {
+      category = this.classifier.classify(features);
+      // Constant-first comparison stays safe if the classifier yields no outcome.
+      if (MENTION.equals(category)) {
+        TimeMention mention = new TimeMention(jCas, node.getBegin(), node.getEnd());
+        mention.addToIndexes();
+      }
+    }
+
+    // Stop at leaves and at mentions, so nothing below a mention is ever labeled.
+    if (node.getLeaf() || MENTION.equals(category)) {
+      return;
+    }
+
+    for (int i = 0; i < node.getChildren().size(); i++) {
+      processNode(jCas, node.getChildren(i));
+    }
+  }
+
+  /** Collects the node-based features followed by the output of every configured extractor. */
+  private List<Feature> buildFeatures(JCas jCas, TreebankNode node)
+      throws CleartkProcessingException {
+    List<Feature> features = new ArrayList<Feature>();
+
+    // The node counts as root when its parent has no parent of its own.
+    if (node.getParent().getParent() == null) {
+      features.add(new Feature("IS_ROOT"));
+    }
+    features.add(new Feature("NODE_LABEL", node.getNodeType()));
+    features.add(new Feature("PARENT_LABEL", node.getParent().getNodeType()));
+
+    if (node.getLeaf()) {
+      features.add(new Feature("IS_LEAF"));
+    } else {
+      features.add(new Feature("PRODUCTION", productionOf(node)));
+    }
+
+    for (SimpleFeatureExtractor extractor : featureExtractors) {
+      features.addAll(extractor.extract(jCas, node));
+    }
+    return features;
+  }
+
+  /** Joins the child labels of a node, each followed by an underscore (e.g. {@code "DT_NN_"}). */
+  private static String productionOf(TreebankNode node) {
+    StringBuilder buffer = new StringBuilder();
+    for (int i = 0; i < node.getChildren().size(); i++) {
+      buffer.append(node.getChildren(i).getNodeType());
+      buffer.append("_");
+    }
+    return buffer.toString();
+  }
+
+  /** Returns true if a gold {@link TimeMention} has exactly the same begin and end as the node. */
+  private static boolean hasExactGoldMention(TreebankNode node) {
+    for (TimeMention mention : JCasUtil.selectCovered(TimeMention.class, node)) {
+      if (mention.getBegin() == node.getBegin() && mention.getEnd() == node.getEnd()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java?rev=1477818&r1=1477817&r2=1477818&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java Tue Apr 30 20:44:13 2013
@@ -72,8 +72,9 @@ public abstract class EvaluationOfAnnota
File rawTextDirectory,
File knowtatorXMLDirectory,
File xmiDirectory,
+ File treebankDirectory,
Class<? extends Annotation> annotationClass) {
- super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, xmiDirectory, null);
+ super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, xmiDirectory, treebankDirectory);
this.annotationClass = annotationClass;
}
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java?rev=1477818&r1=1477817&r2=1477818&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java Tue Apr 30 20:44:13 2013
@@ -23,7 +23,7 @@ import java.util.Collection;
import java.util.List;
import java.util.logging.Level;
-import org.apache.ctakes.temporal.ae.TimeAnnotator;
+import org.apache.ctakes.temporal.ae.ConstituencyBasedTimeAnnotator;
import org.apache.ctakes.typesystem.type.textsem.TimeMention;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.jcas.JCas;
@@ -46,7 +46,8 @@ public class EvaluationOfTimeSpans exten
new File("target/eval/time-spans"),
options.getRawTextDirectory(),
options.getKnowtatorXMLDirectory(),
- options.getXMIDirectory());
+ options.getXMIDirectory(),
+ options.getTreebankDirectory());
evaluation.prepareXMIsFor(patientSets);
evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-time-errors.log"));
AnnotationStatistics<String> stats = evaluation.trainAndTest(trainItems, devItems);
@@ -57,14 +58,15 @@ public class EvaluationOfTimeSpans exten
File baseDirectory,
File rawTextDirectory,
File knowtatorXMLDirectory,
- File xmiDirectory) {
- super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, xmiDirectory, TimeMention.class);
+ File xmiDirectory,
+ File treebankDirectory) {
+ super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, xmiDirectory, treebankDirectory, TimeMention.class);
}
@Override
protected AnalysisEngineDescription getDataWriterDescription(File directory)
throws ResourceInitializationException {
- return TimeAnnotator.createDataWriterDescription(LIBSVMStringOutcomeDataWriter.class, directory);
+ return ConstituencyBasedTimeAnnotator.createDataWriterDescription(LIBSVMStringOutcomeDataWriter.class, directory);
}
@Override
@@ -75,7 +77,7 @@ public class EvaluationOfTimeSpans exten
@Override
protected AnalysisEngineDescription getAnnotatorDescription(File directory)
throws ResourceInitializationException {
- return TimeAnnotator.createAnnotatorDescription(directory);
+ return ConstituencyBasedTimeAnnotator.createAnnotatorDescription(directory);
}
@Override