You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/04/30 22:44:14 UTC

svn commit: r1477818 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/ConstituencyBasedTimeAnnotator.java eval/EvaluationOfAnnotationSpans_ImplBase.java eval/EvaluationOfTimeSpans.java

Author: tmill
Date: Tue Apr 30 20:44:13 2013
New Revision: 1477818

URL: http://svn.apache.org/r1477818
Log:
Added constituency parse-based annotator for time spans.

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java
Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java?rev=1477818&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/ConstituencyBasedTimeAnnotator.java Tue Apr 30 20:44:13 2013
@@ -0,0 +1,200 @@
+package org.apache.ctakes.temporal.ae;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.CleartkAnnotator;
+import org.cleartk.classifier.CleartkProcessingException;
+import org.cleartk.classifier.DataWriter;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor;
+import static org.cleartk.classifier.feature.extractor.CleartkExtractor.*;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Covered;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor.PatternType;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor;
+import org.cleartk.classifier.jar.DefaultDataWriterFactory;
+import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
+import org.cleartk.classifier.jar.GenericJarClassifierFactory;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Finds time expressions by classifying the nodes of a constituency parse.
+ * <p>
+ * Each parse tree covered by a segment is walked top-down. Every visited node is turned into a
+ * feature vector and labelled {@code TIME_MENTION} or {@code NON_TIME_MENTION}. When training, the
+ * label is taken from gold {@link TimeMention} annotations whose span equals the node span and the
+ * instance is written to the data writer; otherwise the classifier decides and a new
+ * {@link TimeMention} is added to the CAS for each node classified as a mention. The walk never
+ * descends below a node labelled as a mention.
+ */
+public class ConstituencyBasedTimeAnnotator extends
+TemporalEntityAnnotator_ImplBase {
+
+  private static final String NON_MENTION = "NON_TIME_MENTION";
+  private static final String MENTION = "TIME_MENTION";
+
+  /**
+   * Builds the training-mode description of this annotator.
+   *
+   * @param dataWriterClass data writer used to serialize the training instances
+   * @param outputDirectory directory the data writer writes to
+   * @return an engine description with {@code PARAM_IS_TRAINING} set to {@code true}
+   * @throws ResourceInitializationException if the description cannot be created
+   */
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<String>> dataWriterClass,
+      File outputDirectory) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        ConstituencyBasedTimeAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriterClass,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory);
+  }
+
+  /**
+   * Builds the classification-mode description of this annotator.
+   *
+   * @param modelDirectory directory containing the trained {@code model.jar}
+   * @return an engine description with {@code PARAM_IS_TRAINING} set to {@code false}
+   * @throws ResourceInitializationException if the description cannot be created
+   */
+  public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
+      throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        ConstituencyBasedTimeAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        new File(modelDirectory, "model.jar"));
+  }
+
+  /** Span-level extractors applied to every parse node in addition to the structural features. */
+  protected List<SimpleFeatureExtractor> featureExtractors;
+
+  /**
+   * Sets up the token-level feature extractors: covered text, two character-category patterns and
+   * the part of speech of each {@link BaseToken}, collected as a bag over the tokens a node covers.
+   */
+  @Override
+  public void initialize(UimaContext context)
+      throws ResourceInitializationException {
+    super.initialize(context);
+
+    CombinedExtractor allExtractors = new CombinedExtractor(
+        new CoveredTextExtractor(),
+        new CharacterCategoryPatternExtractor(PatternType.REPEATS_MERGED),
+        new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+        new TypePathExtractor(BaseToken.class, "partOfSpeech"));
+
+    featureExtractors = new ArrayList<SimpleFeatureExtractor>();
+//    featureExtractors.add(new CleartkExtractor(BaseToken.class, new CoveredTextExtractor(), new Bag(new Covered())));
+    featureExtractors.add(new CleartkExtractor(BaseToken.class, allExtractors, new Bag(new Covered())));
+    // bag of constituent descendant labels (currently disabled)
+//    featureExtractors.add(new CleartkExtractor(TreebankNode.class, new TypePathExtractor(TreebankNode.class, "nodeType"), new Bag(new Covered())));
+  }
+
+  /**
+   * Classifies the nodes of every parse tree covered by {@code segment}.
+   * <p>
+   * Trees whose top node has no children are skipped, since there is nothing to classify and
+   * asking for the first child would fail.
+   */
+  @Override
+  public void process(JCas jCas, Segment segment)
+      throws AnalysisEngineProcessException {
+
+    for (TopTreebankNode root : JCasUtil.selectCovered(TopTreebankNode.class, segment)) {
+      if (root.getChildren() == null || root.getChildren().size() == 0) {
+        continue;
+      }
+      processNode(jCas, root.getChildren(0));
+    }
+  }
+
+  /**
+   * Builds the features for {@code node}, labels it, and recurses into its children unless the
+   * node is a leaf or was labelled as a time mention.
+   *
+   * @param jCas the CAS being processed
+   * @param node the parse node to classify; its parent must be non-null
+   * @throws CleartkProcessingException if feature extraction, writing or classification fails
+   */
+  private void processNode(JCas jCas, TreebankNode node) throws CleartkProcessingException {
+    List<Feature> features = new ArrayList<Feature>();
+    String category = NON_MENTION;
+
+    // node-based features
+    // The node handed in by process() hangs directly off the top node, whose own parent is null.
+    if (node.getParent().getParent() == null) {
+      features.add(new Feature("IS_ROOT"));
+    }
+    features.add(new Feature("NODE_LABEL", node.getNodeType()));
+    features.add(new Feature("PARENT_LABEL", node.getParent().getNodeType()));
+
+    if (node.getLeaf()) {
+      features.add(new Feature("IS_LEAF"));
+    } else {
+      // Each child label is followed by "_", so the value ends with a trailing underscore.
+      StringBuilder production = new StringBuilder();
+      for (int i = 0; i < node.getChildren().size(); i++) {
+        production.append(node.getChildren(i).getNodeType());
+        production.append("_");
+      }
+      features.add(new Feature("PRODUCTION", production.toString()));
+    }
+
+    // other feature types:
+    for (SimpleFeatureExtractor extractor : featureExtractors) {
+      features.addAll(extractor.extract(jCas, node));
+    }
+
+    if (this.isTraining()) {
+      // A node is a positive example only when a gold mention has exactly the node's span.
+      for (TimeMention mention : JCasUtil.selectCovered(TimeMention.class, node)) {
+        if (mention.getBegin() == node.getBegin() && mention.getEnd() == node.getEnd()) {
+          category = MENTION;
+          break;
+        }
+      }
+      this.dataWriter.write(new Instance<String>(category, features));
+    } else {
+      category = this.classifier.classify(features);
+      // Constant-first comparison, matching the check below, so a null outcome cannot throw.
+      if (MENTION.equals(category)) {
+        // add to cas
+        TimeMention mention = new TimeMention(jCas, node.getBegin(), node.getEnd());
+        mention.addToIndexes();
+      }
+    }
+
+    // now do children if not a leaf & not a mention
+    if (node.getLeaf() || MENTION.equals(category)) {
+      return;
+    }
+
+    for (int i = 0; i < node.getChildren().size(); i++) {
+      processNode(jCas, node.getChildren(i));
+    }
+  }
+
+}

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java?rev=1477818&r1=1477817&r2=1477818&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java Tue Apr 30 20:44:13 2013
@@ -72,8 +72,9 @@ public abstract class EvaluationOfAnnota
       File rawTextDirectory,
       File knowtatorXMLDirectory,
       File xmiDirectory,
+      File treebankDirectory,
       Class<? extends Annotation> annotationClass) {
-    super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, xmiDirectory, null);
+    super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, xmiDirectory, treebankDirectory);
     this.annotationClass = annotationClass;
   }
   

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java?rev=1477818&r1=1477817&r2=1477818&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java Tue Apr 30 20:44:13 2013
@@ -23,7 +23,7 @@ import java.util.Collection;
 import java.util.List;
 import java.util.logging.Level;
 
-import org.apache.ctakes.temporal.ae.TimeAnnotator;
+import org.apache.ctakes.temporal.ae.ConstituencyBasedTimeAnnotator;
 import org.apache.ctakes.typesystem.type.textsem.TimeMention;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.jcas.JCas;
@@ -46,7 +46,8 @@ public class EvaluationOfTimeSpans exten
         new File("target/eval/time-spans"),
         options.getRawTextDirectory(),
         options.getKnowtatorXMLDirectory(),
-        options.getXMIDirectory());
+        options.getXMIDirectory(),
+        options.getTreebankDirectory());
     evaluation.prepareXMIsFor(patientSets);
     evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-time-errors.log"));
     AnnotationStatistics<String> stats = evaluation.trainAndTest(trainItems, devItems);
@@ -57,14 +58,15 @@ public class EvaluationOfTimeSpans exten
       File baseDirectory,
       File rawTextDirectory,
       File knowtatorXMLDirectory,
-      File xmiDirectory) {
-    super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, xmiDirectory, TimeMention.class);
+      File xmiDirectory,
+      File treebankDirectory) {
+    super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, xmiDirectory, treebankDirectory, TimeMention.class);
   }
 
   @Override
   protected AnalysisEngineDescription getDataWriterDescription(File directory)
       throws ResourceInitializationException {
-    return TimeAnnotator.createDataWriterDescription(LIBSVMStringOutcomeDataWriter.class, directory);
+    return ConstituencyBasedTimeAnnotator.createDataWriterDescription(LIBSVMStringOutcomeDataWriter.class, directory);
   }
 
   @Override
@@ -75,7 +77,7 @@ public class EvaluationOfTimeSpans exten
   @Override
   protected AnalysisEngineDescription getAnnotatorDescription(File directory)
       throws ResourceInitializationException {
-    return TimeAnnotator.createAnnotatorDescription(directory);
+    return ConstituencyBasedTimeAnnotator.createAnnotatorDescription(directory);
   }
 
   @Override