You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by dl...@apache.org on 2014/04/18 21:07:31 UTC

svn commit: r1588542 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration: ComputeDurationStatistics.java EventEventDurationStatistics.java EventTimeDurationStatistics.java Utils.java

Author: dligach
Date: Fri Apr 18 19:07:30 2014
New Revision: 1588542

URL: http://svn.apache.org/r1588542
Log:
Added a class to display duration information for event arguments of event-event relations. Did some refactoring for EventTimeDurationStatistics related to this.

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventEventDurationStatistics.java   (with props)
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventTimeDurationStatistics.java
      - copied, changed from r1588525, ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/ComputeDurationStatistics.java
Removed:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/ComputeDurationStatistics.java
Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventEventDurationStatistics.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventEventDurationStatistics.java?rev=1588542&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventEventDurationStatistics.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventEventDurationStatistics.java Fri Apr 18 19:07:30 2014
@@ -0,0 +1,152 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.duration;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.ctakes.temporal.eval.CommandLine;
+import org.apache.ctakes.temporal.eval.THYMEData;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.util.JCasUtil;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.Lists;
+import com.google.common.io.Files;
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+
+/**
+ * Analyze duration information for the event arguments of event-event CONTAINS relations.
+ * 
+ * @author dmitriy dligach
+ */
+public class EventEventDurationStatistics {
+
+  static interface Options {
+
+    @Option(longName = "xmi-dir")
+    public File getInputDirectory();
+
+    @Option(longName = "patients")
+    public CommandLine.IntegerRanges getPatients();
+
+    @Option(longName = "output-file")
+    public File getOutputFile();
+  }
+
+  public static void main(String[] args) throws Exception {
+
+    Options options = CliFactory.parseArguments(Options.class, args);
+
+    List<Integer> patientSets = options.getPatients().getList();
+    List<Integer> trainItems = THYMEData.getTrainPatientSets(patientSets);
+    List<File> trainFiles = Utils.getFilesFor(trainItems, options.getInputDirectory());
+    CollectionReader collectionReader = Utils.getCollectionReader(trainFiles);
+
+    AnalysisEngine annotationConsumer = AnalysisEngineFactory.createPrimitive(
+        AnalyseRelationArgumentDuration.class,
+        "OutputFile",
+        options.getOutputFile());
+
+    SimplePipeline.runPipeline(collectionReader, annotationConsumer);
+  }
+
+  /**
+   * Look at event-event relations whose event arguments have duration data.
+   */
+  public static class AnalyseRelationArgumentDuration extends JCasAnnotator_ImplBase {                                               
+
+    @ConfigurationParameter(
+        name = "OutputFile",
+        mandatory = true,
+        description = "path to the file that stores relation data")
+    private String outputFile;
+
+    public static final String GOLD_VIEW_NAME = "GoldView";
+
+    @Override                                                                                                                  
+    public void process(JCas jCas) throws AnalysisEngineProcessException {                                                     
+
+      File durationLookup = new File(Utils.durationDistributionPath);                      
+      Map<String, Map<String, Float>> textToDistribution = null;                                                                 
+      try {                                                                                                                      
+        textToDistribution = Files.readLines(durationLookup, Charsets.UTF_8, new Utils.Callback());                                    
+      } catch(IOException e) {                                                                                                   
+        e.printStackTrace();                                                                                                     
+        return;                                                                                                                  
+      }  
+
+      JCas goldView;                                                                                                           
+      try {                                                                                                                    
+        goldView = jCas.getView(GOLD_VIEW_NAME);                                                                               
+      } catch (CASException e) {                                                                                               
+        throw new AnalysisEngineProcessException(e);                                                                           
+      }                                                                                                                                                                                                                                         
+
+      // find event-event relations where both arguments have duration information
+      for(BinaryTextRelation relation : Lists.newArrayList(JCasUtil.select(goldView, BinaryTextRelation.class))) {
+        if(! relation.getCategory().equals("CONTAINS")) {
+          continue;
+        }
+        
+        RelationArgument arg1 = relation.getArg1();                                                                             
+        RelationArgument arg2 = relation.getArg2(); 
+        String event1Text;
+        String event2Text;
+        if(arg1.getArgument() instanceof EventMention && arg2.getArgument() instanceof EventMention) {
+          event1Text = Utils.normalizeEventText(jCas, arg1.getArgument());
+          event2Text = Utils.normalizeEventText(jCas, arg2.getArgument());
+        } else {
+          // this is not an event-event relation
+          continue;
+        }
+
+        if(textToDistribution.containsKey(event1Text) && textToDistribution.containsKey(event2Text)) {
+          // there is duration information for both arguments
+          float event1ExpectedDuration = Utils.expectedDuration(textToDistribution.get(event1Text));
+          float event2ExpectedDuration = Utils.expectedDuration(textToDistribution.get(event2Text));
+          String context = Utils.getTextBetweenAnnotations(goldView, arg1.getArgument(), arg2.getArgument());
+          String out = String.format("%s|%.5f|%s|%.5f|%s\n", 
+              event1Text, event1ExpectedDuration * 3650, 
+              event2Text, event2ExpectedDuration * 3650, 
+              context.length() < 80 ? context : "...");
+          try {
+            Files.append(out, new File(outputFile), Charsets.UTF_8);
+          } catch (IOException e) {
+            e.printStackTrace();
+          }
+        }
+      }
+    }
+  }
+}

Propchange: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventEventDurationStatistics.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Copied: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventTimeDurationStatistics.java (from r1588525, ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/ComputeDurationStatistics.java)
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventTimeDurationStatistics.java?p2=ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventTimeDurationStatistics.java&p1=ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/ComputeDurationStatistics.java&r1=1588525&r2=1588542&rev=1588542&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/ComputeDurationStatistics.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/EventTimeDurationStatistics.java Fri Apr 18 19:07:30 2014
@@ -19,14 +19,11 @@
 package org.apache.ctakes.temporal.duration;
 
 import java.io.File;
-import java.io.FilenameFilter;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.ctakes.core.cr.XMIReader;
 import org.apache.ctakes.temporal.eval.CommandLine;
 import org.apache.ctakes.temporal.eval.THYMEData;
 import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
@@ -38,11 +35,9 @@ import org.apache.uima.analysis_engine.A
 import org.apache.uima.cas.CASException;
 import org.apache.uima.collection.CollectionReader;
 import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
 import org.uimafit.component.JCasAnnotator_ImplBase;
 import org.uimafit.descriptor.ConfigurationParameter;
 import org.uimafit.factory.AnalysisEngineFactory;
-import org.uimafit.factory.CollectionReaderFactory;
 import org.uimafit.pipeline.SimplePipeline;
 import org.uimafit.util.JCasUtil;
 
@@ -53,11 +48,11 @@ import com.lexicalscope.jewel.cli.CliFac
 import com.lexicalscope.jewel.cli.Option;
 
 /**
- * Analyze duration information for relation arguments.
+ * Analyze duration information for the relation arguments of CONTAINS relations.
  * 
  * @author dmitriy dligach
  */
-public class ComputeDurationStatistics {
+public class EventTimeDurationStatistics {
 
   static interface Options {
 
@@ -77,8 +72,8 @@ public class ComputeDurationStatistics {
 
     List<Integer> patientSets = options.getPatients().getList();
     List<Integer> trainItems = THYMEData.getTrainPatientSets(patientSets);
-    List<File> trainFiles = getFilesFor(trainItems, options.getInputDirectory());
-    CollectionReader collectionReader = getCollectionReader(trainFiles);
+    List<File> trainFiles = Utils.getFilesFor(trainItems, options.getInputDirectory());
+    CollectionReader collectionReader = Utils.getCollectionReader(trainFiles);
 
     AnalysisEngine annotationConsumer = AnalysisEngineFactory.createPrimitive(
         AnalyseRelationArgumentDuration.class,
@@ -88,48 +83,9 @@ public class ComputeDurationStatistics {
     SimplePipeline.runPipeline(collectionReader, annotationConsumer);
   }
 
-  private static CollectionReader getCollectionReader(List<File> inputFiles) throws Exception {
-
-    List<String> fileNames = new ArrayList<>();
-    for(File file : inputFiles) {
-      if(! (file.isHidden())) {
-        fileNames.add(file.getPath());
-      }
-    }
-
-    String[] paths = new String[fileNames.size()];
-    fileNames.toArray(paths);
-
-    return CollectionReaderFactory.createCollectionReader(
-        XMIReader.class,
-        XMIReader.PARAM_FILES,
-        paths);
-  }
-
-  private static List<File> getFilesFor(List<Integer> patientSets, File inputDirectory) {
-
-    List<File> files = new ArrayList<>();
-
-    for (Integer set : patientSets) {
-      final int setNum = set;
-      for (File file : inputDirectory.listFiles(new FilenameFilter(){
-        @Override
-        public boolean accept(File dir, String name) {
-          return name.contains(String.format("ID%03d", setNum));
-        }})) {
-        // skip hidden files like .svn
-        if (!file.isHidden()) {
-          files.add(file);
-        } 
-      }
-    }
-
-    return files;
-  }
-
   /**
-   * Preserve only those event-time relations whose event argument has duration data
-   * and whose time argument can be normalized using Steve's timex normalizer.
+   * Look at those event-time relations whose event argument has duration data
+   * and whose time argument can be normalized using the Bethard timex normalizer.
    */
   public static class AnalyseRelationArgumentDuration extends JCasAnnotator_ImplBase {                                               
 
@@ -161,7 +117,11 @@ public class ComputeDurationStatistics {
       }                                                                                                                                                                                                                                         
 
       // find event-time relations where both arguments have duration information
-      for(BinaryTextRelation relation : Lists.newArrayList(JCasUtil.select(goldView, BinaryTextRelation.class))) {            
+      for(BinaryTextRelation relation : Lists.newArrayList(JCasUtil.select(goldView, BinaryTextRelation.class))) {
+        if(! relation.getCategory().equals("CONTAINS")) {
+          continue;
+        }
+        
         RelationArgument arg1 = relation.getArg1();                                                                             
         RelationArgument arg2 = relation.getArg2(); 
 
@@ -185,10 +145,11 @@ public class ComputeDurationStatistics {
           Map<String, Float> timeDistribution = Utils.convertToDistribution(timeUnits.iterator().next());
           float eventExpectedDuration = Utils.expectedDuration(eventDistribution);
           float timeExpectedDuration = Utils.expectedDuration(timeDistribution);
-          String context = getTextBetweenAnnotations(goldView, arg1.getArgument(), arg2.getArgument());
+          String context = Utils.getTextBetweenAnnotations(goldView, arg1.getArgument(), arg2.getArgument());
           String out = String.format("%s|%.5f|%s|%.5f|%s\n", 
               timeUnits.iterator().next(), timeExpectedDuration * 3650, 
-              eventText, eventExpectedDuration * 3650, context.length() < 80 ? context : "...");
+              eventText, eventExpectedDuration * 3650, 
+              context.length() < 80 ? context : "...");
           try {
             Files.append(out, new File(outputFile), Charsets.UTF_8);
           } catch (IOException e) {
@@ -197,22 +158,5 @@ public class ComputeDurationStatistics {
         }
       }
     }
-
-
-    /** 
-     * Get relation context.
-     */
-    private static String getTextBetweenAnnotations(JCas jCas, Annotation arg1, Annotation arg2) {
-
-      final int windowSize = 5;
-      String text = jCas.getDocumentText();
-
-      int leftArgBegin = Math.min(arg1.getBegin(), arg2.getBegin());
-      int rightArgEnd = Math.max(arg1.getEnd(), arg2.getEnd());
-      int begin = Math.max(0, leftArgBegin - windowSize);
-      int end = Math.min(text.length(), rightArgEnd + windowSize); 
-
-      return text.substring(begin, end).replaceAll("[\r\n]", " ");
-    }
   }
 }

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java?rev=1588542&r1=1588541&r2=1588542&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java Fri Apr 18 19:07:30 2014
@@ -7,9 +7,12 @@ import info.bethard.timenorm.TemporalExp
 import info.bethard.timenorm.TimeSpan;
 import info.bethard.timenorm.TimeSpanSet;
 
+import java.io.File;
+import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -17,16 +20,19 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.ctakes.core.cr.XMIReader;
 import org.apache.ctakes.core.resource.FileLocator;
 import org.apache.ctakes.temporal.ae.feature.duration.DurationEventTimeFeatureExtractor;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionReader;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.threeten.bp.temporal.TemporalField;
 import org.threeten.bp.temporal.TemporalUnit;
+import org.uimafit.factory.CollectionReaderFactory;
 import org.uimafit.util.JCasUtil;
 
 import scala.collection.immutable.Set;
@@ -223,6 +229,22 @@ public class Utils {
     Joiner joiner = Joiner.on(separator);
     return joiner.join(distribution);
   }
+  
+  /** 
+   * Get relation context.
+   */
+  public static String getTextBetweenAnnotations(JCas jCas, Annotation arg1, Annotation arg2) {
+
+    final int windowSize = 5;
+    String text = jCas.getDocumentText();
+
+    int leftArgBegin = Math.min(arg1.getBegin(), arg2.getBegin());
+    int rightArgEnd = Math.max(arg1.getEnd(), arg2.getEnd());
+    int begin = Math.max(0, leftArgBegin - windowSize);
+    int end = Math.min(text.length(), rightArgEnd + windowSize); 
+
+    return text.substring(begin, end).replaceAll("[\r\n]", " ");
+  }
 
   /**
    * Lemmatize word using ClearNLP lemmatizer.
@@ -335,6 +357,52 @@ public class Utils {
     }
   }
   
+  /**
+   * Instantiate an XMI collection reader.
+   */
+  public static CollectionReader getCollectionReader(List<File> inputFiles) throws Exception {
+
+    List<String> fileNames = new ArrayList<>();
+    for(File file : inputFiles) {
+      if(! (file.isHidden())) {
+        fileNames.add(file.getPath());
+      }
+    }
+
+    String[] paths = new String[fileNames.size()];
+    fileNames.toArray(paths);
+
+    return CollectionReaderFactory.createCollectionReader(
+        XMIReader.class,
+        XMIReader.PARAM_FILES,
+        paths);
+  }
+
+  /**
+   * Get files for specific sets of patients.
+   * Useful for selecting e.g. only training files.
+   */
+  public static List<File> getFilesFor(List<Integer> patientSets, File inputDirectory) {
+
+    List<File> files = new ArrayList<>();
+
+    for (Integer set : patientSets) {
+      final int setNum = set;
+      for (File file : inputDirectory.listFiles(new FilenameFilter(){
+        @Override
+        public boolean accept(File dir, String name) {
+          return name.contains(String.format("ID%03d", setNum));
+        }})) {
+        // skip hidden files like .svn
+        if (!file.isHidden()) {
+          files.add(file);
+        } 
+      }
+    }
+
+    return files;
+  }
+  
   public static void main(String[] args) throws IOException {
     
     HashSet<String> timeUnits = getTimeUnits("three months");