You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2018/05/07 20:42:00 UTC
svn commit: r1831126 - in
/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference:
ae/ ae/features/cluster/ ae/features/salience/ ae/pairing/cluster/ eval/
flow/ util/
Author: tmill
Date: Mon May 7 20:42:00 2018
New Revision: 1831126
URL: http://svn.apache.org/viewvc?rev=1831126&view=rev
Log:
Changes to coreference resolution system to read cross-document coreference gold standards and run cross-document coreference resolution.
Added:
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java
Removed:
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/flow/CoreferenceFlowController.java
Modified:
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java?rev=1831126&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java Mon May 7 20:42:00 2018
@@ -0,0 +1,171 @@
+package org.apache.ctakes.coreference.ae;
+
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.coreference.eval.EvaluationOfEventCoreference;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+
+import java.util.*;
+
+/**
+ * Copies gold standard coreference annotations from a gold view into the system view.
+ * <p>
+ * Every gold {@link CollectionTextRelation} (coreference chain) and {@link CoreferenceRelation}
+ * (binary link) is re-created in the system view, using system {@link Markable}s that share the
+ * same nominal dependency head as the corresponding gold markables. Chains containing a gold
+ * markable that cannot be aligned are not copied. With dropout enabled, chain members are
+ * occasionally placed in a different list than the rest of their chain.
+ * <p>
+ * Created by tmill on 4/18/18.
+ */
+@PipeBitInfo(
+    name = "Coreference Copier",
+    description = "Copies gold standard coreference chains and relations from the gold view into the system view.",
+    role = PipeBitInfo.Role.SPECIAL,
+    dependencies = { PipeBitInfo.TypeProduct.MARKABLE, PipeBitInfo.TypeProduct.COREFERENCE_RELATION, PipeBitInfo.TypeProduct.DEPENDENCY_NODE }
+)
+public class CopyCoreferenceRelations extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+  private static final Logger logger = Logger.getLogger(CopyCoreferenceRelations.class);
+
+  /** With dropout enabled, probability that a chain member is not appended to the first system list. */
+  private static final double DROPOUT_RATE = 0.1;
+
+  // TODO - make document aware for mention-cluster coreference? Not as easy as relation remover because this should work for
+  // non-document-aware annotators.
+  public static final String PARAM_GOLD_VIEW = "GoldViewName";
+  @ConfigurationParameter(name = PARAM_GOLD_VIEW, mandatory = false, description = "View containing gold standard annotations")
+  private String goldViewName = EvaluationOfEventCoreference.GOLD_VIEW_NAME;
+
+  public static final String PARAM_DROP_ELEMENTS = "Dropout";
+  @ConfigurationParameter(name = PARAM_DROP_ELEMENTS, mandatory = false)
+  private boolean dropout = false;
+
+  /**
+   * Looks up the configured gold view and copies its coreference annotations into {@code jcas}.
+   *
+   * @param jcas the system view that receives the copied annotations
+   * @throws AnalysisEngineProcessException if the gold view cannot be obtained
+   */
+  @Override
+  public void process(JCas jcas) throws AnalysisEngineProcessException {
+    final JCas goldView;
+    try {
+      goldView = jcas.getView(goldViewName);
+    } catch (CASException e) {
+      // The cause is preserved in the rethrown exception, so no separate stack trace dump is needed.
+      throw new AnalysisEngineProcessException(e);
+    }
+    copyRelations(jcas, goldView, dropout);
+  }
+
+  /**
+   * Copies coreference chains and binary coreference relations from {@code goldView} to {@code jcas}.
+   *
+   * @param jcas     system view; new relations are created in and indexed into this view
+   * @param goldView view holding the gold chains and relations
+   * @param dropout  if true, chain members are occasionally split off into other lists
+   *                 (see {@code DROPOUT_RATE}), so one gold chain may yield several system chains
+   */
+  public static void copyRelations(JCas jcas, JCas goldView, boolean dropout) {
+    Map<Markable, Markable> gold2sys = new HashMap<>();
+    Map<ConllDependencyNode, Collection<Markable>> depIndex =
+        JCasUtil.indexCovering(jcas, ConllDependencyNode.class, Markable.class);
+
+    for (CollectionTextRelation goldChain : JCasUtil.select(goldView, CollectionTextRelation.class)) {
+      // the gold list can be split up into many lists if we allow dropout.
+      List<List<Markable>> systemLists = new ArrayList<>();
+      boolean removeChain = false;
+
+      // Walk the linked list; an empty or missing member list simply yields no system chain.
+      FSList head = goldChain.getMembers();
+      while (head instanceof NonEmptyFSList) {
+        NonEmptyFSList element = (NonEmptyFSList) head;
+        Markable goldMarkable = (Markable) element.getHead();
+        if (goldMarkable == null) {
+          // Cannot align a missing markable: treat the chain as unmappable instead of failing.
+          logger.error("Found an unexpected null gold markable");
+          removeChain = true;
+          break;
+        }
+
+        // if we can't align the gold markable with one in the system cas then drop the whole chain:
+        if (!mapGoldMarkable(jcas, goldMarkable, gold2sys, depIndex)) {
+          String text = isWithinDocument(jcas, goldMarkable) ? goldMarkable.getCoveredText() : "<Out of bounds>";
+          logger.warn(String.format("There is a gold markable %s [%d, %d] which could not map to a system markable.",
+              text, goldMarkable.getBegin(), goldMarkable.getEnd()));
+          removeChain = true;
+          break;
+        }
+
+        addChainMember(systemLists, gold2sys.get(goldMarkable), dropout);
+        head = element.getTail();
+      }
+
+      // Only copy chains whose members could all be aligned; lists of a single element are not chains.
+      if (!removeChain) {
+        for (List<Markable> chain : systemLists) {
+          if (chain.size() > 1) {
+            CollectionTextRelation sysRel = new CollectionTextRelation(jcas);
+            sysRel.setMembers(ListFactory.buildList(jcas, chain));
+            sysRel.addToIndexes();
+          }
+        }
+      }
+    }
+
+    // Copy binary relations whose two arguments were both aligned while copying the chains above.
+    for (CoreferenceRelation goldRel : JCasUtil.select(goldView, CoreferenceRelation.class)) {
+      Markable sysArg1 = gold2sys.get(goldRel.getArg1().getArgument());
+      Markable sysArg2 = gold2sys.get(goldRel.getArg2().getArgument());
+      if (sysArg1 == null || sysArg2 == null) {
+        continue;
+      }
+      CoreferenceRelation sysRel = new CoreferenceRelation(jcas);
+      sysRel.setCategory(goldRel.getCategory());
+      sysRel.setDiscoveryTechnique(CONST.REL_DISCOVERY_TECH_GOLD_ANNOTATION);
+
+      RelationArgument arg1 = new RelationArgument(jcas);
+      arg1.setArgument(sysArg1);
+      sysRel.setArg1(arg1);
+      arg1.addToIndexes();
+
+      RelationArgument arg2 = new RelationArgument(jcas);
+      arg2.setArgument(sysArg2);
+      sysRel.setArg2(arg2);
+      arg2.addToIndexes();
+
+      sysRel.addToIndexes();
+    }
+  }
+
+  /**
+   * Adds a system markable to the lists being built for one gold chain.
+   * Without dropout (or for the first member) the markable goes into the first list. With dropout,
+   * it usually goes into the first list too, but with probability {@code DROPOUT_RATE} it is put
+   * into a randomly chosen list, possibly a newly created one.
+   */
+  private static void addChainMember(List<List<Markable>> systemLists, Markable sysMarkable, boolean dropout) {
+    if (systemLists.isEmpty()) {
+      systemLists.add(new ArrayList<>());
+      systemLists.get(0).add(sysMarkable);
+    } else if (!dropout || Math.random() > DROPOUT_RATE) {
+      // most of the time do the right thing:
+      systemLists.get(0).add(sysMarkable);
+    } else {
+      // ceil() of a value in [0, size) gives an index in 0..size (0 only if the draw is exactly 0);
+      // an index equal to size means "start a new list".
+      int listIndex = (int) Math.ceil(Math.random() * systemLists.size());
+      if (listIndex == systemLists.size()) {
+        systemLists.add(new ArrayList<>());
+      }
+      systemLists.get(listIndex).add(sysMarkable);
+    }
+  }
+
+  /**
+   * Returns true if the markable's span lies inside the document text of {@code jcas}.
+   * The end offset is exclusive, so a markable ending exactly at the end of the text is valid.
+   */
+  private static boolean isWithinDocument(JCas jcas, Markable markable) {
+    return markable.getBegin() >= 0 && markable.getEnd() <= jcas.getDocumentText().length();
+  }
+
+  /**
+   * Fills in an entry in a map from the gold markable passed in to a system markable.
+   * Algorithm:
+   * <ul>
+   *   <li>Find the dependency head for the gold markable</li>
+   *   <li>Iterate over the system markables that span that head</li>
+   *   <li>Check if any of those markables has the same head</li>
+   *   <li>If so, add it to the map and return true</li>
+   * </ul>
+   *
+   * @param jcas         system view used to compute dependency heads
+   * @param goldMarkable gold markable to align; must not be null
+   * @param gold2sys     map that receives the gold-to-system alignment on success
+   * @param depIndex     index from dependency node to the system markables covering it
+   * @return true if an alignment was found and stored, false otherwise
+   */
+  public static boolean mapGoldMarkable(JCas jcas, Markable goldMarkable, Map<Markable,Markable> gold2sys, Map<ConllDependencyNode, Collection<Markable>> depIndex) {
+    if (!isWithinDocument(jcas, goldMarkable)) {
+      // Have seen some instances where anafora writes a span that is not possible, log them
+      // so they can be found and fixed:
+      logger.warn(String.format("There is a markable with span [%d, %d] in a document with length %d\n",
+          goldMarkable.getBegin(), goldMarkable.getEnd(), jcas.getDocumentText().length()));
+      return false;
+    }
+
+    ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
+    Collection<Markable> candidates = depIndex.get(headNode);
+    if (candidates == null) {
+      // Callers may pass any Map implementation; a missing key means no system markable covers the head.
+      return false;
+    }
+    for (Markable sysMarkable : candidates) {
+      if (DependencyUtility.getNominalHeadNode(jcas, sysMarkable) == headNode) {
+        gold2sys.put(goldMarkable, sysMarkable);
+        return true;
+      }
+    }
+    return false;
+  }
+}
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Mon May 7 20:42:00 2018
@@ -10,6 +10,7 @@ import org.apache.ctakes.coreference.ae.
import org.apache.ctakes.coreference.util.ClusterMentionFetcher;
import org.apache.ctakes.coreference.util.MarkableCacheRelationExtractor;
import org.apache.ctakes.coreference.util.MarkableUtilities;
+import org.apache.ctakes.coreference.util.ThymeCasOrderer;
import org.apache.ctakes.dependency.parser.util.DependencyUtility;
import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
@@ -203,10 +204,15 @@ public class MentionClusterCoreferenceAn
protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
JCas jcas,
- Markable mention){
+ Markable mention,
+ JCas prevCas){
LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();
for(ClusterMentionPairer_ImplBase pairer : this.pairExtractors){
- pairs.addAll(pairer.getPairs(jcas, mention));
+ if(prevCas != null && pairer instanceof CrossDocumentPairer_ImplBase){
+ pairs.addAll(((CrossDocumentPairer_ImplBase)pairer).getPairs(jcas, mention, prevCas));
+ }else {
+ pairs.addAll(pairer.getPairs(jcas, mention));
+ }
}
return pairs;
@@ -237,16 +243,19 @@ public class MentionClusterCoreferenceAn
LOGGER.info( "Finding Coreferences ..." );
// It is possible that the cas for an entire patient has been passed through. Try to process all documents.
- final Collection<JCas> docViews = PatientViewUtil.getDocumentViews( jCas );
- if ( docViews.isEmpty() ) {
+ final Collection<JCas> views = PatientViewUtil.getDocumentViews( jCas );
+ if ( views.isEmpty() ) {
// There is only one document in the cas - the default
- processDocument( jCas );
+ processDocument( jCas, null );
LOGGER.info( "Finished." );
return;
}
+ JCas prevView = null;
try ( DotLogger dotter = new DotLogger() ) {
- for ( JCas view : docViews ) {
- processDocument( view );
+ for ( JCas view : ThymeCasOrderer.getOrderedCases(jCas) ) {
+ LOGGER.info("Processing document with view name: " + view.getViewName());
+ processDocument( view, prevView );
+ prevView = view;
}
} catch ( IOException ioE ) {
LOGGER.error( ioE.getMessage() );
@@ -254,7 +263,7 @@ public class MentionClusterCoreferenceAn
LOGGER.info( "Finished." );
}
- private void processDocument( final JCas jCas ) throws AnalysisEngineProcessException {
+ private void processDocument( final JCas jCas, final JCas prevCas ) throws AnalysisEngineProcessException {
// lookup from pair of annotations to binary text relation
// note: assumes that there will be at most one relation per pair
Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation>
@@ -292,7 +301,7 @@ public class MentionClusterCoreferenceAn
CollectionTextRelation maxCluster = null;
String mentionView = mention.getView().getViewName();
- for ( CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs( jCas, mention ) ) {
+ for ( CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs( jCas, mention, prevCas ) ) {
CollectionTextRelation cluster = pair.getCluster();
Markable firstElement = JCasUtil.select(cluster.getMembers(), Markable.class).iterator().next();
String clusterHeadView = firstElement.getView().getViewName();
@@ -337,6 +346,10 @@ public class MentionClusterCoreferenceAn
// create a classification instance and write it to the training data
this.dataWriter.write( new Instance<>( category, features ) );
if ( !category.equals( NO_RELATION_CATEGORY ) ) {
+// LOGGER.warn("Coref training: Writing link between mention: " + mention.getCoveredText() + " and previous cluster containing mention: " + firstElement.getCoveredText());
+ if(!clusterHeadView.equals(mentionView)){
+ LOGGER.info("Writing positive instance linking mention [" + mention.getCoveredText() + "] to cluster with elements from previous document");
+ }
singleton = false;
break;
}
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java?rev=1831126&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java Mon May 7 20:42:00 2018
@@ -0,0 +1,301 @@
+package org.apache.ctakes.coreference.ae;
+
+import com.google.common.collect.Maps;
+import org.apache.ctakes.core.patient.AbstractPatientConsumer;
+import org.apache.ctakes.core.patient.PatientNoteStore;
+import org.apache.ctakes.core.patient.PatientViewUtil;
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.core.util.SourceMetadataUtil;
+import org.apache.ctakes.temporal.ae.THYMEAnaforaXMLReader;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewUriUtil;
+import org.jdom2.Element;
+import org.jdom2.JDOMException;
+import org.jdom2.input.SAXBuilder;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by tmill on 2/22/18.
+ */
+public class ThymeAnaforaCrossDocCorefXmlReader extends AbstractPatientConsumer {
+
+ public static final String PARAM_XML_DIRECTORY = "XmlDirectory";
+ @ConfigurationParameter(
+ name = PARAM_XML_DIRECTORY,
+ description = "Directory containing cross-document coreference annotations"
+ )String xmlDir;
+
+ public static final String PARAM_IS_TRAINING = "IsTraining";
+ @ConfigurationParameter(
+ name = PARAM_IS_TRAINING,
+ description = "Whether this reader is being called at training or test time, and thus whether gold annotations should be put in document or gold CAS"
+ )boolean isTraining;
+
+ private static final String NAME = ThymeAnaforaCrossDocCorefXmlReader.class.getSimpleName();
+ private static final Logger LOGGER = Logger.getLogger(ThymeAnaforaCrossDocCorefXmlReader.class);
+
+ public ThymeAnaforaCrossDocCorefXmlReader(){
+ super(NAME,
+ "Reads gold standard cross-document coreference annotations in the format created for the THYME project, using the Anafora tool.");
+ }
+
+ public static AnalysisEngineDescription getDescription(String xmlDir, boolean training) throws ResourceInitializationException {
+ return AnalysisEngineFactory.createEngineDescription(ThymeAnaforaCrossDocCorefXmlReader.class,
+ ThymeAnaforaCrossDocCorefXmlReader.PARAM_XML_DIRECTORY,
+ xmlDir,
+ ThymeAnaforaCrossDocCorefXmlReader.PARAM_IS_TRAINING,
+ training);
+ }
+
+ @Override
+ public String getEngineName() {
+ return NAME + (isTraining?"_training":"_test");
+ }
+
+ @Override
+ protected void processPatientCas(JCas patientJcas) throws AnalysisEngineProcessException {
+ String patientName = SourceMetadataUtil.getPatientIdentifier( patientJcas );
+ String xmlFilename = String.format("%s.Thyme2v1-PostProc.gold.completed.xml", patientName);
+ File annotationDir = null;
+ for(String subdir : new String[]{"Train", "Dev", "Test"}){
+ annotationDir = new File(new File(this.xmlDir, subdir), patientName);
+ if(annotationDir.exists()) break;
+ }
+ if(annotationDir == null){
+ System.err.println("Could not find a cross-doc coreference file for patient: " + patientName + " in the specified directory: " + this.xmlDir);
+ throw new AnalysisEngineProcessException();
+ }
+ File annotationFile = new File(annotationDir, xmlFilename);
+ if(!annotationFile.exists()){
+// LOGGER.warn("No *PostProc.gold.completed.xml file for this patient... trying Correction...");
+// xmlFilename = String.format("%s.Thyme2v1-Correction.gold.completed.xml", patientName);
+// annotationFile = new File(annotationDir, xmlFilename);
+// if (!annotationFile.exists()) {
+ LOGGER.error("No *Correction.gold.completed.xml file exists for this patient either... please remove from dataset");
+ throw new AnalysisEngineProcessException();
+// }
+ }
+ Map<String,String> notes = new HashMap<>();
+
+ for(File file : annotationDir.listFiles()){
+ if(file.isDirectory()){
+ String fileContents = null;
+ File noteFile = new File(file, file.getName());
+ try {
+ fileContents = new String(Files.readAllBytes(Paths.get(noteFile.toURI())));
+ } catch (IOException e) {
+ e.printStackTrace();
+ throw new AnalysisEngineProcessException(e);
+ }
+ notes.put(file.getName(), fileContents);
+ }
+ }
+ processXmlfile(patientJcas, annotationFile, notes);
+ }
+
+ private void processXmlfile(JCas patientJcas, File xmlFile, Map<String,String> notes) throws AnalysisEngineProcessException {
+ // load the XML
+ Element dataElem;
+ try {
+ dataElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement();
+ } catch (MalformedURLException e) {
+ throw new AnalysisEngineProcessException(e);
+ } catch (JDOMException e) {
+ throw new AnalysisEngineProcessException(e);
+ } catch (IOException e) {
+ throw new AnalysisEngineProcessException(e);
+ }
+ HashMap<String,Integer> docLens = new HashMap<>();
+ notes.forEach((k,v) -> docLens.put(k, v.length()));
+ HashMap<String,JCas> docCases = new HashMap<>();
+ HashMap<String,JCas> goldCases = new HashMap<>();
+ for(String docName : notes.keySet()) {
+ for (JCas docView : PatientViewUtil.getAllViews(patientJcas)) {
+ if (docView.getViewName().contains(docName) && docView.getViewName().contains(CAS.NAME_DEFAULT_SOFA)) {
+ docCases.put(docName, docView);
+ break;
+ }
+ }
+ for(JCas goldView : PatientViewUtil.getAllViews(patientJcas)){
+ if(goldView.getViewName().contains(docName) && goldView.getViewName().contains(PatientViewUtil.GOLD_PREFIX)) {
+ goldCases.put(docName, goldView);
+ }
+ }
+ }
+ for (Element annotationsElem : dataElem.getChildren("annotations")) {
+ // keep track of entity ids as we read entities so that we can find them from the map annotations later:
+ Map<String, Annotation> idToAnnotation = Maps.newHashMap();
+
+ for (Element entityElem : annotationsElem.getChildren("entity")) {
+ String id = removeSingleChildText(entityElem, "id", null);
+ String[] parts = id.split("@");
+ String entNum = parts[0]; // note-specific id for this entity
+ String entNoteName = parts[2]; // which note is this entity in: e.g., ID001_clinic_001
+ String entAnnot = parts[3]; // should be "gold" for gold
+ String entNote = notes.get(entNoteName);
+ JCas entCas = goldCases.get(entNoteName);
+ int docLen = entNote.length();
+ Element spanElem = removeSingleChild(entityElem, "span", id);
+ String type = removeSingleChildText(entityElem, "type", id);
+ Element propertiesElem = removeSingleChild(entityElem, "properties", id);
+
+ // UIMA doesn't support disjoint spans, so take the span enclosing
+ // everything
+ int begin = Integer.MAX_VALUE;
+ int end = Integer.MIN_VALUE;
+ for (String spanString : spanElem.getText().split(";")) {
+ String[] beginEndStrings = spanString.split(",");
+ if (beginEndStrings.length != 2) {
+ error("span not of the format 'number,number'", id);
+ }
+ int spanBegin = Integer.parseInt(beginEndStrings[0]);
+ int spanEnd = Integer.parseInt(beginEndStrings[1]);
+ if (spanBegin < begin && spanBegin >= 0) {
+ begin = spanBegin;
+ }
+ if (spanEnd > end && spanEnd <= docLen) {
+ end = spanEnd;
+ }
+ }
+ if (begin < 0 || end > docLen || end < 0) {
+ error("Illegal begin or end boundary", id);
+ continue;
+ }
+
+ Annotation annotation = null;
+ if (type.equals("Markable")) {
+ while (end >= begin && (entNote.charAt(end - 1) == '\n' || entNote.charAt(end - 1) == '\r')) {
+ end--;
+ }
+ if(begin < 0 || end < 0){
+ error("Illegal negative span", id);
+ }
+ Markable markable = new Markable(entCas, begin, end);
+ markable.addToIndexes();
+ annotation = markable;
+
+ } else {
+ LOGGER.warn(String.format("Skipping entity type %s because the handler hasn't been written.", type));
+ }
+ if (annotation != null) idToAnnotation.put(id, annotation);
+ }
+
+ for (Element relationElem : annotationsElem.getChildren("relation")) {
+ String id = removeSingleChildText(relationElem, "id", null);
+ String[] parts = id.split("@");
+ String relNum = parts[0]; // note-specific id for this entity
+ String relNoteName = parts[2]; // which note is this entity in: e.g., ID001_clinic_001
+ String relAnnot = parts[3]; // should be "gold" for gold
+ String relNote = notes.get(relNoteName);
+ JCas relCas = goldCases.get(relNoteName);
+ String type = removeSingleChildText(relationElem, "type", id);
+ Element propertiesElem = removeSingleChild(relationElem, "properties", id);
+
+ if (type.equals("Identical")) {
+ // Build list of Markables from FirstInstance and Coreferring_String annotations:
+ String mention = removeSingleChildText(propertiesElem, "FirstInstance", id);
+ List<Markable> markables = new ArrayList<>();
+ Markable antecedent, anaphor;
+ antecedent = (Markable) idToAnnotation.get(mention);
+ if(antecedent != null){
+ markables.add(antecedent);
+ }else{
+ error("Null markable as FirstInstance", id);
+ }
+ List<Element> corefs = propertiesElem.getChildren("Coreferring_String");
+ for(Element coref : corefs){
+ mention = coref.getText();
+ anaphor = (Markable) idToAnnotation.get(mention);
+ if(anaphor != null){
+ markables.add(anaphor);
+ }else{
+ error("Null markable as Coreferring_String", id);
+ }
+ }
+ // Iterate over markable list creating binary coref relations:
+ for(int antInd = 0; antInd < markables.size()-1; antInd++){
+ int anaInd = antInd + 1;
+ // create set of binary relations from chain elements:
+ CoreferenceRelation pair = new CoreferenceRelation(relCas);
+ pair.setCategory("Identity");
+ RelationArgument arg1 = new RelationArgument(relCas);
+ arg1.setArgument(markables.get(antInd));
+ arg1.setRole("antecedent");
+ pair.setArg1(arg1);
+ RelationArgument arg2 = new RelationArgument(relCas);
+ arg2.setArgument(markables.get(anaInd));
+ arg2.setRole("anaphor");
+ pair.setArg2(arg2);
+ pair.addToIndexes();
+ }
+ // Create FSList from markable list and add to collection text relation:
+ if(markables.size() > 1){
+ CollectionTextRelation chain = new CollectionTextRelation(relCas);
+ FSList list = ListFactory.buildList(relCas, markables);
+ list.addToIndexes();
+ chain.setMembers(list);
+ chain.addToIndexes();
+ }else{
+ error("Coreference chain of length <= 1", id);
+ }
+ propertiesElem.removeChildren("Coreferring_String");
+ }else{
+ LOGGER.warn(String.format("This script cannot process relations of type %s yet.", type));
+ }
+ }
+ }
+ }
+
+ private static Element getSingleChild(Element elem, String elemName, String causeID) {
+ List<Element> children = elem.getChildren(elemName);
+ if (children.size() != 1) {
+ error(String.format("not exactly one '%s' child", elemName), causeID);
+ }
+ return children.size() > 0 ? children.get(0) : null;
+ }
+
+ private static Element removeSingleChild(Element elem, String elemName, String causeID) {
+ Element child = getSingleChild(elem, elemName, causeID);
+ elem.removeChildren(elemName);
+ return child;
+ }
+
+ private static String removeSingleChildText(Element elem, String elemName, String causeID) {
+ Element child = getSingleChild(elem, elemName, causeID);
+ String text = child.getText();
+ if (text.isEmpty()) {
+ error(String.format("an empty '%s' child", elemName), causeID);
+ text = null;
+ }
+ elem.removeChildren(elemName);
+ return text;
+ }
+
+ private static void error(String found, String id) {
+ LOGGER.error(String.format("found %s in annotation with ID %s", found, id));
+ }
+
+}
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java Mon May 7 20:42:00 2018
@@ -32,7 +32,7 @@ public class MentionClusterAgreementFeat
throw new RuntimeException("This extractor requires a call to setCache()");
}
List<Feature> features = new ArrayList<>();
-
+
String s = mention.getCoveredText().toLowerCase();
boolean isDem = isDemonstrative(s);
boolean isDef = isDefinite(s);
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java Mon May 7 20:42:00 2018
@@ -28,6 +28,9 @@ public class MentionClusterStackFeatures
NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
Annotation mostRecent = ClusterUtils.getMostRecent(members, mention);
+ if(mostRecent == null){
+ return feats;
+ }
int mentionEnd = mostRecent.getEnd();
int numIntervening = 0;
int numNonSingletonIntervening = 0;
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java Mon May 7 20:42:00 2018
@@ -47,7 +47,7 @@ public class MorphosyntacticFeatureExtra
feats.add(new Feature("MorphoIsPronoun", false));
}
- feats.add(new Feature("MorphoIsProper", head.getPostag().equals("NNP")));
+ feats.add(new Feature("MorphoIsProper", (head != null && head.getPostag() != null && head.getPostag().equals("NNP"))));
// skip animacy and person features for now -- planning to not do person mentions
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java?rev=1831126&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java Mon May 7 20:42:00 2018
@@ -0,0 +1,19 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import org.apache.ctakes.coreference.util.ClusterMentionFetcher;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.jcas.JCas;
+
+import java.util.List;
+
+/**
+ * Base class for cluster-mention pairers that can draw candidate clusters from a previous
+ * document as well as from the current one.
+ * <p>
+ * Created by tmill on 3/22/18.
+ */
+public abstract class CrossDocumentPairer_ImplBase extends ClusterMentionPairer_ImplBase {
+
+  /**
+   * Returns the candidate cluster/mention pairs for a mention.
+   *
+   * @param jcas    the CAS passed for the mention being paired
+   * @param m       the mention to pair with clusters
+   * @param prevCas the CAS of the previous document; the two-argument overload passes null here
+   */
+  public abstract List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m, JCas prevCas);
+
+  /**
+   * Single-document entry point: delegates to the three-argument overload with no previous document.
+   */
+  @Override
+  public List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m) {
+    final JCas noPreviousDocument = null;
+    return getPairs(jcas, m, noPreviousDocument);
+  }
+}
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java Mon May 7 20:42:00 2018
@@ -19,7 +19,7 @@
package org.apache.ctakes.coreference.ae.pairing.cluster;
import org.apache.ctakes.coreference.util.ClusterMentionFetcher;
-import org.apache.ctakes.temporal.utils.PatientViewsUtil;
+import org.apache.ctakes.coreference.util.ThymeCasOrderer;
import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
import org.apache.ctakes.typesystem.type.textsem.Markable;
import org.apache.uima.fit.util.JCasUtil;
@@ -31,11 +31,10 @@ import java.util.List;
/**
* Created by tmill on 9/21/17.
*/
-public class PreviousDocumentPairer extends ClusterMentionPairer_ImplBase {
+public class PreviousDocumentPairer extends CrossDocumentPairer_ImplBase {
@Override
- public List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m) {
+ public List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m, JCas prevCas) {
List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> clusters = new ArrayList<>();
- JCas prevCas = PatientViewsUtil.getPreviousDocumentCas(jcas);
if(prevCas == null) return clusters;
for(CollectionTextRelation chain : JCasUtil.select(prevCas, CollectionTextRelation.class)){
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java Mon May 7 20:42:00 2018
@@ -7,6 +7,7 @@ import com.google.common.collect.Sets;
import com.lexicalscope.jewel.cli.CliFactory;
import com.lexicalscope.jewel.cli.Option;
import de.bwaldvogel.liblinear.FeatureNode;
+import org.apache.commons.lang.NotImplementedException;
import org.apache.ctakes.assertion.medfacts.cleartk.*;
import org.apache.ctakes.core.config.ConfigParameterConstants;
import org.apache.ctakes.core.patient.AbstractPatientConsumer;
@@ -58,6 +59,7 @@ import org.apache.uima.fit.factory.Aggre
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.pipeline.JCasIterator;
import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
@@ -85,6 +87,7 @@ import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import java.util.stream.Collectors;
public class EvaluationOfEventCoreference extends EvaluationOfTemporalRelations_ImplBase {
@@ -253,7 +256,7 @@ public class EvaluationOfEventCoreferenc
aggregateBuilder.add(HistoryCleartkAnalysisEngine.createAnnotatorDescription());
aggregateBuilder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription());
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ViewCreatorAnnotator.class, ViewCreatorAnnotator.PARAM_VIEW_NAME, "Baseline"));
+// aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ViewCreatorAnnotator.class, ViewCreatorAnnotator.PARAM_VIEW_NAME, "Baseline"));
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphAnnotator.class));
// aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphVectorAnnotator.class));
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
@@ -261,21 +264,18 @@ public class EvaluationOfEventCoreferenc
aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
if(this.goldMarkables){
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyGoldMarkablesInChains.class));
+ throw new NotImplementedException("Using gold markables needs to be rewritten to be compatible with patient-level annotations.");
+// aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyGoldMarkablesInChains.class));
}else{
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
// aggregateBuilder.add(CopyFromGold.getDescription(/*Markable.class,*/ CoreferenceRelation.class, CollectionTextRelation.class));
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
}
- // MarkableHeadTreeCreator creates a cache of mappings from Markables to dependency heads since so many feature extractors use that information
- // major speedup
-// aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(MarkableHeadTreeCreator.class));
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class), CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME);
- // the coreference module uses segments to index markables, but we don't have them in the gold standard.
aggregateBuilder.add(CopyFromSystem.getDescription(Segment.class), GOLD_VIEW_NAME, GOLD_VIEW_NAME);
aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
if(this.evalType == EVAL_SYSTEM.MENTION_PAIR){
+ aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class), CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME);
aggregateBuilder.add(EventCoreferenceAnnotator.createDataWriterDescription(
// TKSVMlightStringOutcomeDataWriter.class,
FlushingDataWriter.class,
@@ -287,6 +287,8 @@ public class EvaluationOfEventCoreferenc
Logger.getLogger(EventCoreferenceAnnotator.class).setLevel(Level.WARN);
}else if(this.evalType == EVAL_SYSTEM.MENTION_CLUSTER){
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PatientNoteCollector.class));
+ aggregateBuilder.add(ThymeAnaforaCrossDocCorefXmlReader.getDescription(this.xmlDirectory.getAbsolutePath(), true ) );
+ aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCrossDocCoreferenceRelations.class));
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
PatientMentionClusterCoreferencer.class,
CleartkAnnotator.PARAM_IS_TRAINING,
@@ -300,6 +302,7 @@ public class EvaluationOfEventCoreferenc
MentionClusterCoreferenceAnnotator.PARAM_SINGLE_DOCUMENT,
false));
}else if(this.evalType == EVAL_SYSTEM.CLUSTER_RANK){
+ aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class), CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME);
aggregateBuilder.add(MentionClusterRankingCoreferenceAnnotator.createDataWriterDescription(
SvmLightRankDataWriter.class,
directory,
@@ -308,9 +311,6 @@ public class EvaluationOfEventCoreferenc
logger.warn("Encountered a training configuration that does not add an annotator: " + this.evalType);
}
- // If we are using mention-cluster algorithm, it is aware of multiple documents so we only have to call it once.
- // FlowControllerDescription corefFlowControl = FlowControllerFactory.createFlowControllerDescription(CoreferenceFlowController.class);
- // aggregateBuilder.setFlowControllerDescription(corefFlowControl);
AnalysisEngineDescription aed = aggregateBuilder.createAggregateDescription();
SimplePipeline.runPipeline(collectionReader, AnalysisEngineFactory.createEngine(aed));
}
@@ -350,7 +350,6 @@ public class EvaluationOfEventCoreferenc
AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIdFromURI.class));
aggregateBuilder.add("Patient id printer", AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
-// AggregateBuilder singleNoteBuilder = new AggregateBuilder();
aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
aggregateBuilder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
aggregateBuilder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription());
@@ -362,15 +361,10 @@ public class EvaluationOfEventCoreferenc
aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
aggregateBuilder.add(EventAnnotator.createAnnotatorDescription());
aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
-// singleNoteBuilder.add(AnalysisEngineFactory.createEngineDescription(CoreferenceChainScoringOutput.class,
-// ConfigParameterConstants.PARAM_OUTPUTDIR,
-// this.outputDirectory + File.separator + goldOut,
-// CoreferenceChainScoringOutput.PARAM_GOLD_VIEW_NAME,
-// goldViewName),
-// CAS.NAME_DEFAULT_SOFA,
-// viewName);
+
if(this.goldMarkables){
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyGoldMarkablesInChains.class)); //CopyFromGold.getDescription(Markable.class));
+ throw new NotImplementedException("Using gold markables needs to be rewritten to be compatible with patient-level annotations.");
+// aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyGoldMarkablesInChains.class)); //CopyFromGold.getDescription(Markable.class));
}else{
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
// aggregateBuilder.add(CopyFromGold.getDescription(/*Markable.class,*/ CoreferenceRelation.class, CollectionTextRelation.class));
@@ -379,7 +373,8 @@ public class EvaluationOfEventCoreferenc
aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
if(this.evalType == EVAL_SYSTEM.MENTION_CLUSTER) {
// Do nothing but we still need this here so the else clause works right
- aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(EvaluationPatientNoteCollector.class));
+ aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PatientNoteCollector.class));
+ aggregateBuilder.add(ThymeAnaforaCrossDocCorefXmlReader.getDescription(this.xmlDirectory.getAbsolutePath(), false));
aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PatientMentionClusterCoreferencer.class,
CleartkAnnotator.PARAM_IS_TRAINING,
false,
@@ -716,155 +711,92 @@ public class EvaluationOfEventCoreferenc
}
@PipeBitInfo(
- name = "Coreference Copier",
- description = "Sets Modality based upon context.",
+ name = "CrossDoc Coreference Copier",
+ description = "Copies markables and relations from gold to system view",
role = PipeBitInfo.Role.SPECIAL,
dependencies = { PipeBitInfo.TypeProduct.MARKABLE, PipeBitInfo.TypeProduct.COREFERENCE_RELATION }
)
- public static class CopyCoreferenceRelations extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
- // TODO - make document aware for mention-cluster coreference? Not as easy as relation remover because this should work for
- // non-document-aware annotators.
- public static final String PARAM_GOLD_VIEW = "GoldViewName";
- @ConfigurationParameter(name=PARAM_GOLD_VIEW, mandatory=false, description="View containing gold standard annotations")
- private String goldViewName=GOLD_VIEW_NAME;
-
- public static final String PARAM_DROP_ELEMENTS = "Dropout";
- @ConfigurationParameter(name = PARAM_DROP_ELEMENTS, mandatory=false)
- private boolean dropout = false;
+ public static class CopyCrossDocCoreferenceRelations extends AbstractPatientConsumer {
+
+ public CopyCrossDocCoreferenceRelations() {
+ super("CopyCrossDocCoreferenceRelations", "Copy gold coreference relations from gold cas to system cas for training");
+ }
- @SuppressWarnings("synthetic-access")
@Override
- public void process(JCas jcas) throws AnalysisEngineProcessException {
- JCas goldView = null;
- try {
- goldView = jcas.getView(goldViewName);
- } catch (CASException e) {
- e.printStackTrace();
- throw new AnalysisEngineProcessException(e);
- }
-
- HashMap<Markable,Markable> gold2sys = new HashMap<>();
- Map<ConllDependencyNode,Collection<Markable>> depIndex = JCasUtil.indexCovering(jcas, ConllDependencyNode.class, Markable.class);
- // remove those with removed markables (person mentions)
- List<CollectionTextRelation> toRemove = new ArrayList<>();
-
- for(CollectionTextRelation goldChain : JCasUtil.select(goldView, CollectionTextRelation.class)){
- FSList head = goldChain.getMembers();
-// NonEmptyFSList sysList = new NonEmptyFSList(jcas);
-// NonEmptyFSList listEnd = sysList;
- List<List<Markable>> systemLists = new ArrayList<>(); // the gold list can be split up into many lists if we allow dropout.
- boolean removeChain = false;
- List<Markable> prevList = null;
-
- // first one is guaranteed to be nonempty otherwise it would not be in cas
- do{
- NonEmptyFSList element = (NonEmptyFSList) head;
- Markable goldMarkable = (Markable) element.getHead();
- if(goldMarkable == null){
- logger.error(String.format("Found an unexpected null gold markable"));
- }
- boolean mapped = mapGoldMarkable(jcas, goldMarkable, gold2sys, depIndex);
-
- // if we can't align the gold markable with one in the system cas then don't add it:
- if(!mapped){
- String text = "<Out of bounds>";
- if(!(goldMarkable.getBegin() < 0 || goldMarkable.getEnd() >= jcas.getDocumentText().length())){
- text = goldMarkable.getCoveredText();
- }
- logger.warn(String.format("There is a gold markable %s [%d, %d] which could not map to a system markable.",
- text, goldMarkable.getBegin(), goldMarkable.getEnd()));
- removeChain = true;
- break;
- }
-
- Markable sysMarkable = gold2sys.get(goldMarkable);
- if(!dropout || systemLists.size() == 0){
- if(systemLists.size() == 0) systemLists.add(new ArrayList<>());
- systemLists.get(0).add(sysMarkable);
-// prevList = systemLists.get(0);
-// // if this is not first time through move listEnd to end.
-// if(listEnd.getHead() != null){
-// listEnd.setTail(new NonEmptyFSList(jcas));
-// listEnd.addToIndexes();
-// listEnd = (NonEmptyFSList) listEnd.getTail();
-// }
-//
-// // add markable to end of list:
-// listEnd.setHead(gold2sys.get(goldMarkable));
- }else{
- // 3 options: Do correctly (append to same list as last element), ii) Start its own list, iii) Randomly join another list
- if(Math.random() > DROPOUT_RATE){
- // most of the time do the right thing:
- systemLists.get(0).add(sysMarkable);
- }else{
- int listIndex = (int) Math.ceil(Math.random() * systemLists.size());
- if(listIndex == systemLists.size()){
- systemLists.add(new ArrayList<>());
- }
- systemLists.get(listIndex).add(sysMarkable);
- }
- }
- head = element.getTail();
- }while(head instanceof NonEmptyFSList);
-
- // don't bother copying over -- the gold chain was of person mentions
- if(!removeChain){
-// listEnd.setTail(new EmptyFSList(jcas));
-// listEnd.addToIndexes();
-// listEnd.getTail().addToIndexes();
-// sysList.addToIndexes();
- for(List<Markable> chain : systemLists){
- if(chain.size() > 1){
- CollectionTextRelation sysRel = new CollectionTextRelation(jcas);
- sysRel.setMembers(ListFactory.buildList(jcas, chain));
- sysRel.addToIndexes();
- }
+ public String getEngineName() {
+ return "CopyCrossDocCoreferenceRelations";
+ }
+
+ @Override
+ public void initialize(final UimaContext context) throws ResourceInitializationException {
+ super.initialize(context);
+ }
+
+ @Override
+ protected void processPatientCas(JCas patientJcas) throws AnalysisEngineProcessException {
+ Collection<JCas> docCases = PatientViewUtil.getAllViews(patientJcas).
+ stream().
+ filter(s -> (s.getViewName().contains(CAS.NAME_DEFAULT_SOFA) && !s.getViewName().equals(CAS.NAME_DEFAULT_SOFA))).
+ collect(Collectors.toList());
+ Collection<JCas> goldCases = PatientViewUtil.getAllViews(patientJcas).
+ stream().
+ filter(s -> s.getViewName().contains(GOLD_VIEW_NAME)).
+ collect(Collectors.toList());
+ Map<Markable, Markable> gold2sys = new HashMap<>();
+
+ // Map all markables in gold cases to equivalents in docCases
+ for (JCas goldCas : goldCases) {
+ JCas docCas = getAlignedDocCas(docCases, goldCas);
+ if (docCas == null) {
+ logger.error("Could not find aligned document CAS for this gold CAS.");
+ throw new AnalysisEngineProcessException();
+ }
+ Map<ConllDependencyNode, Collection<Markable>> depIndex = JCasUtil.indexCovering(docCas, ConllDependencyNode.class, Markable.class);
+
+ for (Markable goldMarkable : JCasUtil.select(goldCas, Markable.class)) {
+ boolean match = CopyCoreferenceRelations.mapGoldMarkable(docCas, goldMarkable, gold2sys, depIndex);
+ if (!match) {
+ logger.warn(String.format("There is a gold markable %s [%d, %d] which could not map to a system markable.",
+ goldMarkable.getCoveredText(), goldMarkable.getBegin(), goldMarkable.getEnd()));
+
}
}
}
-
- for(CoreferenceRelation goldRel : JCasUtil.select(goldView, CoreferenceRelation.class)){
- if((gold2sys.containsKey(goldRel.getArg1().getArgument()) && gold2sys.containsKey(goldRel.getArg2().getArgument()))){
- CoreferenceRelation sysRel = new CoreferenceRelation(jcas);
- sysRel.setCategory(goldRel.getCategory());
- sysRel.setDiscoveryTechnique(CONST.REL_DISCOVERY_TECH_GOLD_ANNOTATION);
-
- RelationArgument arg1 = new RelationArgument(jcas);
- arg1.setArgument(gold2sys.get(goldRel.getArg1().getArgument()));
- sysRel.setArg1(arg1);
- arg1.addToIndexes();
-
- RelationArgument arg2 = new RelationArgument(jcas);
- arg2.setArgument(gold2sys.get(goldRel.getArg2().getArgument()));
- sysRel.setArg2(arg2);
- arg2.addToIndexes();
-
- sysRel.addToIndexes();
+ // now go through all gold chains:
+ for (JCas goldCas : goldCases) {
+ JCas docCas = getAlignedDocCas(docCases, goldCas);
+ if (docCas == null) {
+ logger.error("Could not find aligned document CAS for this gold CAS.");
+ throw new AnalysisEngineProcessException();
+ }
+ // create system chains from all the mapped markables
+ for (CollectionTextRelation chain : JCasUtil.select(goldCas, CollectionTextRelation.class)) {
+ ArrayList<Markable> mappedElements = new ArrayList<>();
+ for (Markable goldElement : JCasUtil.select(chain.getMembers(), Markable.class)) {
+ Markable sysElement = gold2sys.get(goldElement);
+ if (sysElement != null) mappedElements.add(sysElement);
+ }
+ if (mappedElements.size() <= 1) {
+ logger.warn("Gold chain did not have enough markables map to system markables.");
+ } else {
+ CollectionTextRelation sysChain = new CollectionTextRelation(docCas);
+ sysChain.setMembers(FSCollectionFactory.createFSList(docCas, mappedElements));
+ sysChain.addToIndexes();
+ }
}
}
}
-
- private static boolean mapGoldMarkable(JCas jcas, Markable goldMarkable, Map<Markable,Markable> gold2sys, Map<ConllDependencyNode, Collection<Markable>> depIndex){
- if(!(goldMarkable.getBegin() < 0 || goldMarkable.getEnd() >= jcas.getDocumentText().length())){
-
-
- ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
- for(Markable sysMarkable : depIndex.get(headNode)){
- ConllDependencyNode markNode = DependencyUtility.getNominalHeadNode(jcas, sysMarkable);
- if(markNode == headNode){
- gold2sys.put(goldMarkable, sysMarkable);
- return true;
- }
+ private static JCas getAlignedDocCas(Collection<JCas> docCases, JCas goldCas) {
+ JCas docCas = null;
+
+ for (JCas candidate : docCases) {
+ if (goldCas.getViewName().replace(GOLD_VIEW_NAME, CAS.NAME_DEFAULT_SOFA).equals(candidate.getViewName())) {
+ docCas = candidate;
+ break;
}
- }else{
- // Have seen some instances where anafora writes a span that is not possible, log them
- // so they can be found and fixed:
- logger.warn(String.format("There is a markable with span [%d, %d] in a document with length %d\n",
- goldMarkable.getBegin(), goldMarkable.getEnd(), jcas.getDocumentText().length()));
- return false;
}
- return false;
+ return docCas;
}
}
@@ -914,7 +846,7 @@ public class EvaluationOfEventCoreferenc
!markable.getCoveredText().toLowerCase().equals("it")){
toRemove.add(markable);
}else if(coveredTokens.size() > 0 && (coveredTokens.get(0).getCoveredText().startsWith("Mr.") || coveredTokens.get(0).getCoveredText().startsWith("Dr.") ||
- coveredTokens.get(0).getCoveredText().startsWith("Mrs.") || coveredTokens.get(0).getCoveredText().startsWith("Ms."))){
+ coveredTokens.get(0).getCoveredText().startsWith("Mrs.") || coveredTokens.get(0).getCoveredText().startsWith("Ms.") || coveredTokens.get(0).getCoveredText().startsWith("Miss"))){
toRemove.add(markable);
}else if(markable.getCoveredText().toLowerCase().endsWith("patient") || markable.getCoveredText().toLowerCase().equals("pt")){
toRemove.add(markable);
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java?rev=1831126&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java Mon May 7 20:42:00 2018
@@ -0,0 +1,52 @@
+package org.apache.ctakes.coreference.util;
+
+import org.apache.ctakes.core.patient.PatientViewUtil;
+import org.apache.ctakes.coreference.ae.ThymeAnaforaCrossDocCorefXmlReader;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.jcas.JCas;
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Comparator that orders document views of a patient CAS by the trailing number in their
+ * view names (e.g. a name containing ID001_clinic_003 is ordered by 3), plus a helper that
+ * collects and sorts those views.
+ *
+ * Created by tmill on 3/22/18.
+ */
+public class ThymeCasOrderer implements Comparator<JCas> {
+    // "ID<digits>_<text without underscores>_<digits>"; the last digit group is the sort key.
+    private static final Pattern fnPatt = Pattern.compile("ID(\\d+)_([^_]+)_(\\d+)");
+    static ThymeCasOrderer sorter = new ThymeCasOrderer();
+
+    // TODO: Maybe this should just be done when we read them in?
+    /**
+     * Collects the document views of a patient CAS and sorts them with this comparator.
+     *
+     * @param jCas patient CAS whose views are inspected
+     * @return a new, sorted list of the views whose name contains the default sofa name but is
+     *         longer than it (the exact default view is the main patient CAS and is left out)
+     */
+    public static List<JCas> getOrderedCases(JCas jCas) {
+        List<JCas> cases = new ArrayList<>();
+
+        Collection<JCas> allViews = PatientViewUtil.getAllViews(jCas);
+        for (JCas view : allViews) {
+            String viewName = view.getViewName();
+            // contains the default CAS name but isn't _exactly_ the default CAS name (that would be the main patient cas)
+            if (viewName.contains(CAS.NAME_DEFAULT_SOFA) &&
+                    viewName.length() > CAS.NAME_DEFAULT_SOFA.length()) {
+                cases.add(view);
+            }
+        }
+        // Ordered by the last numeric component of the view name; see compare().
+        Collections.sort(cases, sorter);
+        return cases;
+    }
+
+    /**
+     * Compares two views by the document index parsed from their view names. Views whose name
+     * does not match the expected pattern get index -1 and therefore sort first.
+     */
+    @Override
+    public int compare(JCas cas0, JCas cas1) {
+        // Integer.compare instead of subtraction: a difference can overflow (e.g. MAX_VALUE - (-1)).
+        return Integer.compare(documentIndex(cas0), documentIndex(cas1));
+    }
+
+    /** Returns the third regex group of the view name as an int, or -1 when the name does not match. */
+    private static int documentIndex(JCas cas) {
+        Matcher m = fnPatt.matcher(cas.getViewName());
+        return m.find() ? Integer.parseInt(m.group(3)) : -1;
+    }
+}