You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/05/10 15:40:01 UTC
svn commit: r1481009 - in
/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal:
data/analysis/TimexTreeAlignmentStatistics.java
eval/RemoveTreeAlignedMentions.java
Author: tmill
Date: Fri May 10 13:40:01 2013
New Revision: 1481009
URL: http://svn.apache.org/r1481009
Log:
Two new classes for tree alignment.
Added:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java
Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java?rev=1481009&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/TimexTreeAlignmentStatistics.java Fri May 10 13:40:01 2013
@@ -0,0 +1,126 @@
+package org.apache.ctakes.temporal.data.analysis;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.constituency.parser.util.TreeUtils;
+import org.apache.ctakes.temporal.eval.CommandLine;
+import org.apache.ctakes.temporal.eval.Evaluation_ImplBase.XMIReader;
+import org.apache.ctakes.temporal.eval.THYMEData;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.utils.tree.SimpleTree;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.util.JCasUtil;
+
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+
+/**
+ * Command line tool that counts how many gold-view time mentions share their exact span with
+ * a treebank node, and prints the smallest covering treebank node for every mention that
+ * does not.
+ */
+public class TimexTreeAlignmentStatistics {
+  /** Command line options, parsed in {@link #main(String[])}. */
+  static interface Options {
+    @Option(longName = "xmi")
+    public File getXMIDirectory();
+
+    @Option(longName = "patients")
+    public CommandLine.IntegerRanges getPatients();
+
+    @Option(longName = "text")
+    public File getRawTextDirectory();
+  }
+
+  /**
+   * Runs the raw text files of the training patient sets through the XMI reader and prints
+   * the number of time mentions seen and the number whose span equals a treebank node span.
+   *
+   * @param args command line arguments described by {@link Options}
+   * @throws IOException if an I/O error occurs while setting up or running the pipeline
+   * @throws UIMAException if a UIMA component fails
+   */
+  public static void main(String[] args) throws UIMAException, IOException {
+    Options options = CliFactory.parseArguments(Options.class, args);
+    List<Integer> patientSets = options.getPatients().getList();
+    // Only the training split is analysed by this tool.
+    List<Integer> trainItems = THYMEData.getTrainPatientSets(patientSets);
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(
+        getFilesFor(trainItems, options.getRawTextDirectory()));
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        XMIReader.class,
+        XMIReader.PARAM_XMI_DIRECTORY,
+        options.getXMIDirectory()));
+    AnalysisEngine ae = aggregateBuilder.createAggregate();
+    int numMentions = 0;
+    int numMatches = 0;
+
+    for (JCas jCas : new JCasIterable(reader, ae)) {
+      for (Segment segment : JCasUtil.select(jCas, Segment.class)) {
+        // Time mentions are read from the "GoldView" view; treebank nodes are looked up
+        // through the JCas yielded by the pipeline.
+        Collection<TimeMention> mentions =
+            JCasUtil.selectCovered(jCas.getView("GoldView"), TimeMention.class, segment);
+        for (TimeMention mention : mentions) {
+          numMentions++;
+          boolean match = false;
+          List<TreebankNode> nodes = JCasUtil.selectCovered(jCas, TreebankNode.class, mention);
+          for (TreebankNode node : nodes) {
+            if (node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd()) {
+              numMatches++;
+              match = true;
+              break;
+            }
+          }
+          if (!match) {
+            // No node has the same span: report the shortest node that covers the mention.
+            List<TreebankNode> coveringNodes = JCasUtil.selectCovering(
+                jCas, TreebankNode.class, mention.getBegin(), mention.getEnd());
+            TreebankNode smallestCoveringNode = null;
+            int smallestLen = Integer.MAX_VALUE;
+            for (TreebankNode node : coveringNodes) {
+              int len = node.getEnd() - node.getBegin();
+              if (len < smallestLen) {
+                smallestLen = len;
+                smallestCoveringNode = node;
+              }
+            }
+            System.out.println("No alignment for: " + mention.getCoveredText());
+            System.out.println("Smallest covering treebank node is: "
+                + (smallestCoveringNode == null ? "null" : smallestCoveringNode.getCoveredText()));
+            System.out.println(
+                smallestCoveringNode == null ? "no tree" : TreeUtils.tree2str(smallestCoveringNode));
+          }
+        }
+      }
+    }
+    System.out.printf("Found %d mentions, %d match with node spans\n", numMentions, numMatches);
+  }
+
+  /**
+   * Collects the non-hidden files in {@code rawTextDirectory} whose name contains the
+   * identifier of one of the given patient sets, formatted as {@code ID%03d}.
+   *
+   * @param patientSets patient set numbers to look for
+   * @param rawTextDirectory directory to search
+   * @return the matching files; empty when {@code rawTextDirectory} is not a directory
+   */
+  private static List<File> getFilesFor(List<Integer> patientSets, File rawTextDirectory) {
+    // isDirectory() is false for a missing path as well as for a regular file.
+    if (!rawTextDirectory.isDirectory()) {
+      return Collections.emptyList();
+    }
+    List<File> files = new ArrayList<File>();
+    for (Integer set : patientSets) {
+      final String setId = String.format("ID%03d", set);
+      File[] matches = rawTextDirectory.listFiles(new FilenameFilter() {
+        @Override
+        public boolean accept(File dir, String name) {
+          return name.contains(setId);
+        }
+      });
+      // listFiles returns null rather than an empty array when the listing fails.
+      if (matches == null) {
+        continue;
+      }
+      for (File file : matches) {
+        // skip hidden files like .svn
+        if (!file.isHidden()) {
+          files.add(file);
+        }
+      }
+    }
+    return files;
+  }
+}
Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java?rev=1481009&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/RemoveTreeAlignedMentions.java Fri May 10 13:40:01 2013
@@ -0,0 +1,66 @@
+package org.apache.ctakes.temporal.eval;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.util.JCasUtil;
+
+public class RemoveTreeAlignedMentions extends JCasAnnotator_ImplBase {
+
+ public static final String PARAM_GOLDVIEW_NAME = "GOLD_VIEW_NAME";
+ public static Logger logger = Logger.getLogger(RemoveTreeAlignedMentions.class);
+
+ @ConfigurationParameter(
+ name = PARAM_GOLDVIEW_NAME,
+ mandatory = true,
+ description = "Name of the cas view of gold standard data")
+ private String goldViewName;
+
+ @Override
+ public void process(JCas jCas) throws AnalysisEngineProcessException {
+ JCas goldView = null;
+ try {
+ goldView = jCas.getView(goldViewName);
+ } catch (CASException e) {
+ e.printStackTrace();
+ throw new AnalysisEngineProcessException("Could not extract gold view from jcas!", new Object[]{e});
+ }
+
+ Collection<TimeMention> times = JCasUtil.select(jCas, TimeMention.class);
+ logger.info("File contains: " + times.size() + " timex mentions from first pass.");
+
+ List<Annotation> removeList = new ArrayList<Annotation>();
+ for(TimeMention time : times){
+ List<TreebankNode> nodes = JCasUtil.selectCovered(jCas, TreebankNode.class, time);
+ boolean match = false;
+ for(TreebankNode node : nodes){
+ if(node.getBegin() == time.getBegin() && node.getEnd() == time.getEnd()){
+ // we have a match
+ match = true;
+ break;
+ }
+ }
+ if(match){
+ // add the mention since it aligns with a tree.
+ removeList.add(time);
+// time.removeFromIndexes();
+ }
+ }
+ for(Annotation mention : removeList){
+ mention.removeFromIndexes();
+ }
+ for(TimeMention time : JCasUtil.select(jCas, TimeMention.class)){
+ logger.info("Preserved time mention: " + time.getCoveredText());
+ }
+ }
+}