You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2017/03/29 19:30:54 UTC

svn commit: r1789408 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/THYMEAnaforaXMLReader.java ae/THYMEQAAnaforaXMLReader.java utils/AnnotationIdCollection.java utils/TLinkTypeArray2.java

Author: tmill
Date: Wed Mar 29 19:30:54 2017
New Revision: 1789408

URL: http://svn.apache.org/viewvc?rev=1789408&view=rev
Log:
Minor fixes to the THYME reader to enable reading of QA Anafora files. Switched from BinaryTextRelation to TemporalTextRelation.

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEQAAnaforaXMLReader.java
Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEAnaforaXMLReader.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/AnnotationIdCollection.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TLinkTypeArray2.java

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEAnaforaXMLReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEAnaforaXMLReader.java?rev=1789408&r1=1789407&r2=1789408&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEAnaforaXMLReader.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEAnaforaXMLReader.java Wed Mar 29 19:30:54 2017
@@ -69,7 +69,7 @@ public class THYMEAnaforaXMLReader exten
       name = PARAM_ANAFORA_DIRECTORY,
       description = "root directory of the Anafora-annotated files, with one subdirectory for "
           + "each annotated file")
-  private File anaforaDirectory;
+  protected File anaforaDirectory;
 
   public static final String PARAM_ANAFORA_XML_SUFFIXES = "anaforaSuffixes";
 
@@ -78,7 +78,7 @@ public class THYMEAnaforaXMLReader exten
       mandatory = false,
       description = "list of suffixes that might be added to a file name to identify the Anafora "
           + "XML annotations file; only the first suffix corresponding to a file will be used")
-  private String[] anaforaXMLSuffixes = new String[] {
+  protected String[] anaforaXMLSuffixes = new String[] {
       ".Temporal-Relations.gold.completed.xml",
       ".Temporal-Relation.gold.completed.xml",
           ".Temporal.dave.completed.xml",
@@ -88,7 +88,8 @@ public class THYMEAnaforaXMLReader exten
       ".temporal.Temporal-Entities.gold.completed.xml",
       ".Temporal-Entity.gold.completed.xml",
       ".Gold_Temporal_Entities.xml",
-      ".Gold_Temporal_Relations.xml"};
+      ".Gold_Temporal_Relations.xml"
+      };
 
   public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
     return AnalysisEngineFactory.createEngineDescription(THYMEAnaforaXMLReader.class);

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEQAAnaforaXMLReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEQAAnaforaXMLReader.java?rev=1789408&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEQAAnaforaXMLReader.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMEQAAnaforaXMLReader.java Wed Mar 29 19:30:54 2017
@@ -0,0 +1,336 @@
+package org.apache.ctakes.temporal.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.Event;
+import org.apache.ctakes.typesystem.type.refsem.EventProperties;
+import org.apache.ctakes.typesystem.type.relation.AspectualTextRelation;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.TOP;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewUriUtil;
+import org.jdom2.Element;
+import org.jdom2.JDOMException;
+import org.jdom2.input.SAXBuilder;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+public class THYMEQAAnaforaXMLReader extends THYMEAnaforaXMLReader {
+	private static Logger LOGGER = Logger.getLogger(THYMEQAAnaforaXMLReader.class);
+	
+  /**
+   * Builds an engine description for this reader, configured with the THYME QA
+   * annotation-file suffixes and the given Anafora directory.
+   *
+   * @param anaforaDirectory value passed for {@code PARAM_ANAFORA_DIRECTORY}
+   * @return the description produced by {@link AnalysisEngineFactory}
+   * @throws ResourceInitializationException propagated from the factory call
+   */
+  public static AnalysisEngineDescription getDescription(File anaforaDirectory)
+      throws ResourceInitializationException {
+    // order matters: process() uses the first suffix whose file exists
+    String[] qaSuffixes = {
+        ".THYME_QA.timi4508.completed.xml",
+        ".THYME_QA.gusa3085.completed.xml",
+        ".THYME_QA.bethard.completed.xml",
+        ".THYME_QA.dligach.completed.xml"
+    };
+    return AnalysisEngineFactory.createEngineDescription(
+        THYMEQAAnaforaXMLReader.class,
+        THYMEAnaforaXMLReader.PARAM_ANAFORA_XML_SUFFIXES, qaSuffixes,
+        THYMEAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY, anaforaDirectory);
+  }
+
+  /**
+   * Finds the Anafora XML annotation file for the document in the CAS and loads it,
+   * followed by the optional coreference file.
+   *
+   * <p>Candidate files are the text file's path with each configured suffix appended;
+   * the first candidate that exists is used. The configured Anafora directory does not
+   * influence the lookup in this reader.
+   *
+   * @param jCas CAS whose view URI identifies the source text file
+   * @throws AnalysisEngineProcessException if an annotation file cannot be loaded
+   * @throws IllegalArgumentException if suffixes are configured but no candidate file exists
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    // the source text file is identified by the URI stored in the CAS
+    File textFile = new File(ViewUriUtil.getURI(jCas));
+    LOGGER.info("processing " + textFile);
+
+    String textPath = textFile.getPath();
+    File corefFile = new File(textPath + ".Coreference.gold.completed.xml");
+
+    // build every candidate name and remember the first one that exists on disk
+    List<File> possibleXMLFiles = Lists.newArrayList();
+    File xmlFile = null;
+    for (String anaforaXMLSuffix : this.anaforaXMLSuffixes) {
+      File candidate = new File(textPath + anaforaXMLSuffix);
+      possibleXMLFiles.add(candidate);
+      if (xmlFile == null && candidate.exists()) {
+        xmlFile = candidate;
+      }
+    }
+
+    boolean suffixesConfigured = this.anaforaXMLSuffixes.length > 0;
+    if (xmlFile == null && suffixesConfigured) {
+      throw new IllegalArgumentException("no Anafora XML file found from " + possibleXMLFiles);
+    }
+
+    if (xmlFile != null) {
+      processXmlFile(jCas, xmlFile);
+    }
+    // coreference annotations are optional and are read after the main file
+    if (corefFile.exists()) {
+      processXmlFile(jCas, corefFile);
+    }
+  }
+
+  /**
+   * Parses one Anafora XML file and adds its annotations to the CAS.
+   *
+   * <p>For every {@code annotations} element: each {@code entity} becomes an
+   * {@link EventMention} (type EVENT) or a {@link TimeMention} (type TIMEX3); each
+   * {@code relation} becomes a {@link TemporalTextRelation} (TLINK), an
+   * {@link AspectualTextRelation} (ALINK), or is collected as a question (Question)
+   * and afterwards turned into a {@link CollectionTextRelation} whose members are the
+   * annotations named by its Answer children. Problems found in the XML are logged via
+   * {@link #error(String, String)} instead of being thrown, except for unsupported
+   * entity or relation types.
+   *
+   * @param jCas CAS receiving the annotations
+   * @param xmlFile Anafora XML file to read
+   * @throws AnalysisEngineProcessException if the file cannot be read or parsed
+   * @throws UnsupportedOperationException for an entity or relation type not listed above
+   */
+  private static void processXmlFile(JCas jCas, File xmlFile) throws AnalysisEngineProcessException{
+    // load the XML
+    Element dataElem;
+    try {
+      dataElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement();
+    } catch (JDOMException | IOException e) {
+      // IOException also covers the MalformedURLException from toURL()
+      throw new AnalysisEngineProcessException(e);
+    }
+
+    // id counters run across all annotations elements of this file
+    int curEventId = 1;
+    int curTimexId = 1;
+    int curRelId = 1;
+    int docLen = jCas.getDocumentText().length();
+
+    for (Element annotationsElem : dataElem.getChildren("annotations")) {
+      // Question description -> ids of its answers. Scoped to this annotations element so
+      // that questions from an earlier element are not emitted again for a later one.
+      Map<String, List<String>> questionRelations = Maps.newHashMap();
+
+      // TODO -- need mapping from id to relation
+      Map<String, Annotation> idToAnnotation = Maps.newHashMap();
+      for (Element entityElem : annotationsElem.getChildren("entity")) {
+        String id = removeSingleChildText(entityElem, "id", null);
+        Element spanElem = removeSingleChild(entityElem, "span", id);
+        String type = removeSingleChildText(entityElem, "type", id);
+        Element propertiesElem = removeSingleChild(entityElem, "properties", id);
+
+        // UIMA doesn't support disjoint spans, so take the span enclosing
+        // everything
+        int begin = Integer.MAX_VALUE;
+        int end = Integer.MIN_VALUE;
+        for (String spanString : spanElem.getText().split(";")) {
+          String[] beginEndStrings = spanString.split(",");
+          if (beginEndStrings.length != 2) {
+            error("span not of the format 'number,number'", id);
+            // skip the malformed piece; indexing into it could run past the array
+            continue;
+          }
+          int spanBegin = Integer.parseInt(beginEndStrings[0]);
+          int spanEnd = Integer.parseInt(beginEndStrings[1]);
+          if (spanBegin < begin) {
+            begin = spanBegin;
+          }
+          if (spanEnd > end) {
+            end = spanEnd;
+          }
+        }
+        // begin > end also catches the case where no span piece could be parsed.
+        // NOTE(review): end == docLen is rejected here although an end offset equal to the
+        // document length may be legitimate -- confirm this is intended.
+        if (begin < 0 || end >= docLen || begin > end) {
+          error("Illegal begin or end boundary", id);
+          continue;
+        }
+
+        Annotation annotation;
+        // constant-first equals: the text helpers return null for empty values
+        if ("EVENT".equals(type)) {
+          String docTimeRel = removeSingleChildText(propertiesElem, "DocTimeRel", id);
+          if (docTimeRel == null) {
+            error("no docTimeRel, assuming OVERLAP", id);
+            docTimeRel = "OVERLAP";
+          }
+          String polarity = removeSingleChildText(propertiesElem, "Polarity", id);
+          EventMention eventMention = new EventMention(jCas, begin, end);
+          Event event = new Event(jCas);
+          EventProperties eventProperties = new EventProperties(jCas);
+          eventProperties.setDocTimeRel(docTimeRel);
+          if ("POS".equals(polarity)) {
+            eventProperties.setPolarity(CONST.NE_POLARITY_NEGATION_ABSENT);
+          } else if ("NEG".equals(polarity)) {
+            eventProperties.setPolarity(CONST.NE_POLARITY_NEGATION_PRESENT);
+          } else {
+            // includes a null polarity; the property is then left unset
+            error("polarity that was not POS or NEG", id);
+          }
+          eventProperties.addToIndexes();
+          event.setConfidence(1.0f);
+          event.setDiscoveryTechnique(CONST.NE_DISCOVERY_TECH_GOLD_ANNOTATION);
+          event.setProperties(eventProperties);
+          event.setMentions(new FSArray(jCas, 1));
+          event.setMentions(0, eventMention);
+          event.addToIndexes();
+          eventMention.setId(curEventId++);
+          eventMention.setConfidence(1.0f);
+          eventMention.setDiscoveryTechnique(CONST.NE_DISCOVERY_TECH_GOLD_ANNOTATION);
+          eventMention.setEvent(event);
+          eventMention.addToIndexes();
+          annotation = eventMention;
+
+        } else if ("TIMEX3".equals(type)) {
+          String timeClass = removeSingleChildText(propertiesElem, "Class", id);
+          TimeMention timeMention = new TimeMention(jCas, begin, end);
+          timeMention.setId(curTimexId++);
+          timeMention.setTimeClass(timeClass);
+          timeMention.addToIndexes();
+          annotation = timeMention;
+
+        } else {
+          throw new UnsupportedOperationException("unsupported entity type: " + type);
+        }
+
+        // match the annotation to its ID for later use
+        idToAnnotation.put(id, annotation);
+
+        // make sure all XML has been consumed
+        removeSingleChild(entityElem, "parentsType", id);
+        if (!propertiesElem.getChildren().isEmpty() || !entityElem.getChildren().isEmpty()) {
+          List<String> children = Lists.newArrayList();
+          for (Element child : propertiesElem.getChildren()) {
+            children.add(child.getName());
+          }
+          for (Element child : entityElem.getChildren()) {
+            children.add(child.getName());
+          }
+          error("unprocessed children " + children, id);
+        }
+      }
+
+      for (Element relationElem : annotationsElem.getChildren("relation")) {
+        String id = removeSingleChildText(relationElem, "id", null);
+        String type = removeSingleChildText(relationElem, "type", id);
+        Element propertiesElem = removeSingleChild(relationElem, "properties", id);
+
+        if ("TLINK".equals(type)) {
+          String sourceID = removeSingleChildText(propertiesElem, "Source", id);
+          String targetID = removeSingleChildText(propertiesElem, "Target", id);
+          String tlinkType = removeSingleChildText(propertiesElem, "Type", id);
+          TemporalTextRelation relation = new TemporalTextRelation(jCas);
+          relation.setId(curRelId++);
+          addRelation(jCas, relation, sourceID, targetID, tlinkType, idToAnnotation, id);
+
+        } else if ("ALINK".equals(type)) {
+          String sourceID = removeSingleChildText(propertiesElem, "Source", id);
+          String targetID = removeSingleChildText(propertiesElem, "Target", id);
+          String alinkType = removeSingleChildText(propertiesElem, "Type", id);
+          AspectualTextRelation relation = new AspectualTextRelation(jCas);
+          addRelation(jCas, relation, sourceID, targetID, alinkType, idToAnnotation, id);
+
+        } else if ("Question".equals(type)) {
+          String questionText = removeSingleChildText(propertiesElem, "Question", id);
+          String confidence = removeSingleChildText(propertiesElem, "Confidence", id);
+          String difficulty = removeSingleChildText(propertiesElem, "Difficulty", id);
+          String questionDescription = questionText + " - Confidence: " + confidence + " - Difficulty: " + difficulty;
+
+          // a question may have any number of Answer children, each holding an annotation id
+          List<String> ids = new ArrayList<>();
+          for (Element answer : propertiesElem.getChildren("Answer")) {
+            ids.add(answer.getText());
+          }
+          propertiesElem.removeChildren("Answer");
+          questionRelations.put(questionDescription, ids);
+        } else {
+          throw new UnsupportedOperationException("unsupported relation type: " + type);
+        }
+
+        // make sure all XML has been consumed
+        removeSingleChild(relationElem, "parentsType", id);
+        if (!propertiesElem.getChildren().isEmpty() || !relationElem.getChildren().isEmpty()) {
+          List<String> children = Lists.newArrayList();
+          for (Element child : propertiesElem.getChildren()) {
+            children.add(child.getName());
+          }
+          for (Element child : relationElem.getChildren()) {
+            children.add(child.getName());
+          }
+          error("unprocessed children " + children, id);
+        }
+      }
+
+      // After reading in all the relations we can create the Question annotations
+      for (Map.Entry<String, List<String>> question : questionRelations.entrySet()) {
+        CollectionTextRelation qaRel = new CollectionTextRelation(jCas);
+        qaRel.setCategory(question.getKey());
+        List<TOP> answerList = new ArrayList<>();
+        for (String answerId : question.getValue()) {
+          // NOTE(review): an id that is not an entity in idToAnnotation (see TODO above)
+          // resolves to null and is still added -- confirm downstream code tolerates that
+          TOP answer = idToAnnotation.get(answerId);
+          answerList.add(answer);
+        }
+        qaRel.setMembers(ListFactory.buildList(jCas, answerList));
+        qaRel.addToIndexes();
+      }
+    }
+  }
+  /**
+   * Returns the first child of {@code elem} named {@code elemName}, logging an error
+   * when there is not exactly one such child.
+   *
+   * @param elem parent element to search
+   * @param elemName name of the wanted child
+   * @param causeID id of the enclosing annotation, used only in the log message
+   * @return the first matching child, or {@code null} if there is none
+   */
+  private static Element getSingleChild(Element elem, String elemName, String causeID) {
+    List<Element> matches = elem.getChildren(elemName);
+    int count = matches.size();
+    if (count != 1) {
+      error(String.format("not exactly one '%s' child", elemName), causeID);
+    }
+    if (count == 0) {
+      return null;
+    }
+    return matches.get(0);
+  }
+
+  /**
+   * Looks up the child named {@code elemName} via {@code getSingleChild}, then removes
+   * the children with that name from {@code elem}.
+   *
+   * @return the child that was found (now detached), or {@code null} if there was none
+   */
+  private static Element removeSingleChild(Element elem, String elemName, String causeID) {
+    final Element first = getSingleChild(elem, elemName, causeID);
+    // strip the name from the parent so the "unprocessed children" checks see it as consumed
+    elem.removeChildren(elemName);
+    return first;
+  }
+
+  /**
+   * Removes the children of {@code elem} named {@code elemName} and returns the text of
+   * the first one.
+   *
+   * @param elem parent element to read from and strip
+   * @param elemName name of the child element
+   * @param causeID id of the enclosing annotation, used only in log messages
+   * @return the child's text, or {@code null} if the child is missing or its text is empty
+   */
+  private static String removeSingleChildText(Element elem, String elemName, String causeID) {
+    Element child = getSingleChild(elem, elemName, causeID);
+    String text = null;
+    // a missing child has already been logged by getSingleChild; report it as null text
+    // instead of dereferencing it
+    if (child != null) {
+      text = child.getText();
+      if (text.isEmpty()) {
+        error(String.format("an empty '%s' child", elemName), causeID);
+        text = null;
+      }
+    }
+    elem.removeChildren(elemName);
+    return text;
+  }
+
+  /**
+   * Fills in {@code relation} with the annotations named by {@code sourceID} and
+   * {@code targetID} and adds it to the CAS indexes. Nothing is added when either id is
+   * {@code null} or does not resolve to an annotation.
+   *
+   * @param jCas CAS in which the relation arguments are created
+   * @param relation relation object to populate and index
+   * @param sourceID id of the annotation used as argument 1
+   * @param targetID id of the annotation used as argument 2
+   * @param category value stored as the relation's category
+   * @param idToAnnotation lookup from annotation id to annotation
+   * @param causeID id of the relation being read, used only in log messages
+   */
+  private static void addRelation(
+      JCas jCas,
+      BinaryTextRelation relation,
+      String sourceID,
+      String targetID,
+      String category,
+      Map<String, Annotation> idToAnnotation,
+      String causeID) {
+    if (sourceID == null || targetID == null) {
+      return;
+    }
+    // resolve both ends first so that each unresolved id gets its own log entry
+    Annotation from = getArgument(sourceID, idToAnnotation, causeID);
+    Annotation to = getArgument(targetID, idToAnnotation, causeID);
+    if (from == null || to == null) {
+      return;
+    }
+
+    RelationArgument firstArg = new RelationArgument(jCas);
+    firstArg.setArgument(from);
+    firstArg.addToIndexes();
+
+    RelationArgument secondArg = new RelationArgument(jCas);
+    secondArg.setArgument(to);
+    secondArg.addToIndexes();
+
+    relation.setCategory(category);
+    relation.setArg1(firstArg);
+    relation.setArg2(secondArg);
+    relation.addToIndexes();
+  }
+
+  /**
+   * Looks up the annotation registered under {@code id}, logging an error if there is none.
+   *
+   * @param id annotation id to resolve
+   * @param idToAnnotation lookup from annotation id to annotation
+   * @param causeID id of the relation being read, used only in the log message
+   * @return the annotation, or {@code null} if the id is unknown
+   */
+  private static Annotation getArgument(
+      String id,
+      Map<String, Annotation> idToAnnotation,
+      String causeID) {
+    Annotation found = idToAnnotation.get(id);
+    if (found != null) {
+      return found;
+    }
+    error("no annotation with id " + id, causeID);
+    return null;
+  }
+
+  /**
+   * Logs a problem found while reading the XML at error level.
+   *
+   * @param found description of what was found
+   * @param id id of the annotation in which it was found
+   */
+  private static void error(String found, String id) {
+    String message = String.format("found %s in annotation with ID %s", found, id);
+    LOGGER.error(message);
+  }
+
+}

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/AnnotationIdCollection.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/AnnotationIdCollection.java?rev=1789408&r1=1789407&r2=1789408&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/AnnotationIdCollection.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/AnnotationIdCollection.java Wed Mar 29 19:30:54 2017
@@ -167,6 +167,8 @@ public class AnnotationIdCollection {
 		 * {@inheritDoc}
 		 */
 		public int compare( final Annotation arg1, final Annotation arg2 ) {
+			if(arg1 == null) return -1;
+			else if(arg2 == null) return 1;
 			final int startDiff = arg1.getBegin() - arg2.getBegin();
 			if ( startDiff != 0 ) {
 				return startDiff;

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TLinkTypeArray2.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TLinkTypeArray2.java?rev=1789408&r1=1789407&r2=1789408&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TLinkTypeArray2.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TLinkTypeArray2.java Wed Mar 29 19:30:54 2017
@@ -21,10 +21,12 @@ package org.apache.ctakes.temporal.utils
 import org.apache.ctakes.temporal.utils.AnnotationIdCollection;
 import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
 import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 
 
+import java.time.temporal.Temporal;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashSet;
@@ -154,7 +156,7 @@ public class TLinkTypeArray2 {
 	   arg1.setArgument(entityA);
 	   RelationArgument arg2 = new RelationArgument(jCas);
 	   arg2.setArgument(entityB);
-	   BinaryTextRelation relation = new BinaryTextRelation(jCas);
+	   TemporalTextRelation relation = new TemporalTextRelation(jCas);
 	   relation.setArg1(arg1);
 	   relation.setArg2(arg2);
 	   relation.setCategory(tlinkType.name().replace( "_", "-" ));//check if this is correct