You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2018/05/07 20:42:00 UTC
svn commit: r1831126 - in /ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference: ae/ ae/features/cluster/ ae/features/salience/ ae/pairing/cluster/ eval/ flow/ util/

Author: tmill
Date: Mon May  7 20:42:00 2018
New Revision: 1831126

URL: http://svn.apache.org/viewvc?rev=1831126&view=rev
Log:
Changes to coreference resolution system to read cross-document coreference gold standards and run cross-document coreference resolution.

Added:
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java
Removed:
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/flow/CoreferenceFlowController.java
Modified:
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java?rev=1831126&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CopyCoreferenceRelations.java Mon May  7 20:42:00 2018
@@ -0,0 +1,171 @@
+package org.apache.ctakes.coreference.ae;
+
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.coreference.eval.EvaluationOfEventCoreference;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+
+import java.util.*;
+
+/**
+ * Created by tmill on 4/18/18.
+ *
+ * Copies gold-standard coreference annotations onto the markables of the system view. Every gold
+ * {@link CollectionTextRelation} (chain) and {@link CoreferenceRelation} (pair) found in the gold view is
+ * re-created in the system view over the system markables that share a nominal dependency head with the
+ * gold markables. With dropout enabled, a fraction of the chain members is deliberately assigned to a
+ * different chain.
+ */
+@PipeBitInfo(
+        name = "Coreference Copier",
+        description = "Copies gold coreference chains and relations from the gold view onto system markables.",
+        role = PipeBitInfo.Role.SPECIAL,
+        dependencies = { PipeBitInfo.TypeProduct.MARKABLE, PipeBitInfo.TypeProduct.COREFERENCE_RELATION, PipeBitInfo.TypeProduct.DEPENDENCY_NODE }
+)
+public class CopyCoreferenceRelations extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+    // Log under this class so messages are attributed to the copier rather than to the evaluation driver.
+    private static final Logger logger = Logger.getLogger(CopyCoreferenceRelations.class);
+    // Probability with which a chain member is NOT appended to the primary chain when dropout is enabled.
+    private static final double DROPOUT_RATE = 0.1;
+
+    // TODO - make document aware for mention-cluster coreference? Not as easy as relation remover because this should work for
+    // non-document-aware annotators.
+    public static final String PARAM_GOLD_VIEW = "GoldViewName";
+    @ConfigurationParameter(name=PARAM_GOLD_VIEW, mandatory=false, description="View containing gold standard annotations")
+    private String goldViewName=EvaluationOfEventCoreference.GOLD_VIEW_NAME;
+
+    public static final String PARAM_DROP_ELEMENTS = "Dropout";
+    @ConfigurationParameter(name = PARAM_DROP_ELEMENTS, mandatory=false)
+    private boolean dropout = false;
+
+    /**
+     * Looks up the configured gold view and copies its coreference annotations into {@code jcas}.
+     *
+     * @param jcas the system view that receives the copied annotations
+     * @throws AnalysisEngineProcessException if the gold view cannot be retrieved
+     */
+    @SuppressWarnings("synthetic-access")
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+        final JCas goldView;
+        try {
+            goldView = jcas.getView(goldViewName);
+        } catch (CASException e) {
+            // the cause travels with the rethrown exception; no separate stack-trace dump is needed
+            throw new AnalysisEngineProcessException(e);
+        }
+        copyRelations(jcas, goldView, dropout);
+    }
+
+    /**
+     * Re-creates the gold chains and pairwise coreference relations of {@code goldView} in {@code jcas}.
+     * A gold chain is skipped entirely as soon as one of its members cannot be aligned with a system markable.
+     *
+     * @param jcas     system view; new annotations are created in and indexed to this view
+     * @param goldView view holding the gold chains and relations
+     * @param dropout  if true, each chain member after the first is, with probability {@code DROPOUT_RATE},
+     *                 put into a secondary chain instead of the primary one
+     */
+    public static void copyRelations(JCas jcas, JCas goldView, boolean dropout){
+
+        Map<Markable,Markable> gold2sys = new HashMap<>();
+        Map<ConllDependencyNode,Collection<Markable>> depIndex = JCasUtil.indexCovering(jcas, ConllDependencyNode.class, Markable.class);
+
+        for(CollectionTextRelation goldChain : JCasUtil.select(goldView, CollectionTextRelation.class)){
+            FSList head = goldChain.getMembers();
+            List<List<Markable>> systemLists = new ArrayList<>(); // the gold list can be split up into many lists if we allow dropout.
+            boolean removeChain = false;
+
+            // A while loop (rather than do/while) so that an empty or missing member list is simply skipped.
+            while(head instanceof NonEmptyFSList){
+                NonEmptyFSList element = (NonEmptyFSList) head;
+                Markable goldMarkable = (Markable) element.getHead();
+                if(goldMarkable == null){
+                    // cannot be aligned, so treat it like any other unmappable member and drop the chain
+                    logger.error("Found an unexpected null gold markable");
+                    removeChain = true;
+                    break;
+                }
+                boolean mapped = mapGoldMarkable(jcas, goldMarkable, gold2sys, depIndex);
+
+                // if we can't align the gold markable with one in the system cas then don't add it:
+                if(!mapped){
+                    String text = isWithinDocument(jcas, goldMarkable) ? goldMarkable.getCoveredText() : "<Out of bounds>";
+                    logger.warn(String.format("There is a gold markable %s [%d, %d] which could not map to a system markable.",
+                            text, goldMarkable.getBegin(), goldMarkable.getEnd()));
+                    removeChain = true;
+                    break;
+                }
+
+                Markable sysMarkable = gold2sys.get(goldMarkable);
+                boolean firstMember = systemLists.isEmpty();
+                if(firstMember){
+                    systemLists.add(new ArrayList<>());
+                }
+                // Without dropout, for the first member, and most of the time with dropout: use the primary list.
+                if(!dropout || firstMember || Math.random() > DROPOUT_RATE){
+                    systemLists.get(0).add(sysMarkable);
+                }else{
+                    // Otherwise pick an index in [0, size]; index == size starts a brand new list.
+                    // (ceil makes index 0 possible only when Math.random() returns exactly 0.0.)
+                    int listIndex = (int) Math.ceil(Math.random() * systemLists.size());
+                    if(listIndex == systemLists.size()){
+                        systemLists.add(new ArrayList<>());
+                    }
+                    systemLists.get(listIndex).add(sysMarkable);
+                }
+                head = element.getTail();
+            }
+
+            // don't copy anything from a chain that had a member we could not align
+            if(!removeChain){
+                for(List<Markable> chain : systemLists){
+                    // singletons are not chains
+                    if(chain.size() > 1){
+                        CollectionTextRelation sysRel = new CollectionTextRelation(jcas);
+                        sysRel.setMembers(ListFactory.buildList(jcas, chain));
+                        sysRel.addToIndexes();
+                    }
+                }
+            }
+        }
+
+        for(CoreferenceRelation goldRel : JCasUtil.select(goldView, CoreferenceRelation.class)){
+            // gold2sys never holds null values, so a null lookup means "not aligned"
+            Markable sysArg1 = gold2sys.get(goldRel.getArg1().getArgument());
+            Markable sysArg2 = gold2sys.get(goldRel.getArg2().getArgument());
+            if(sysArg1 == null || sysArg2 == null){
+                continue;
+            }
+            CoreferenceRelation sysRel = new CoreferenceRelation(jcas);
+            sysRel.setCategory(goldRel.getCategory());
+            sysRel.setDiscoveryTechnique(CONST.REL_DISCOVERY_TECH_GOLD_ANNOTATION);
+
+            RelationArgument arg1 = new RelationArgument(jcas);
+            arg1.setArgument(sysArg1);
+            sysRel.setArg1(arg1);
+            arg1.addToIndexes();
+
+            RelationArgument arg2 = new RelationArgument(jcas);
+            arg2.setArgument(sysArg2);
+            sysRel.setArg2(arg2);
+            arg2.addToIndexes();
+
+            sysRel.addToIndexes();
+        }
+    }
+
+    /**
+     * Fills in an entry in {@code gold2sys} from the gold markable passed in to a system markable.
+     * Algorithm:
+     *  * Find the nominal dependency head of the gold markable
+     *  * Iterate over the system markables that span that head
+     *  * Check if any of those markables has the same head
+     *  * if so add it to the map and return true
+     *
+     * @param jcas         system view in which heads are looked up
+     * @param goldMarkable gold markable to align; must not be null
+     * @param gold2sys     map that receives the gold-to-system alignment
+     * @param depIndex     for each dependency node, the system markables covering it
+     * @return true if an alignment was found and recorded, false otherwise
+     */
+    public static boolean mapGoldMarkable(JCas jcas, Markable goldMarkable, Map<Markable,Markable> gold2sys, Map<ConllDependencyNode, Collection<Markable>> depIndex){
+        if(!isWithinDocument(jcas, goldMarkable)){
+            // Have seen some instances where anafora writes a span that is not possible, log them
+            // so they can be found and fixed:
+            logger.warn(String.format("There is a markable with span [%d, %d] in a document with length %d\n",
+                    goldMarkable.getBegin(), goldMarkable.getEnd(), jcas.getDocumentText().length()));
+            return false;
+        }
+
+        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
+        if(headNode == null){
+            return false;
+        }
+        Collection<Markable> candidates = depIndex.get(headNode);
+        if(candidates == null){
+            return false;
+        }
+        for(Markable sysMarkable : candidates){
+            ConllDependencyNode markNode = DependencyUtility.getNominalHeadNode(jcas, sysMarkable);
+            if(headNode.equals(markNode)){
+                gold2sys.put(goldMarkable, sysMarkable);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Tells whether the markable's span lies inside the document text of {@code jcas}. The end offset is
+     * exclusive, so a markable ending exactly at the document length is still inside.
+     */
+    private static boolean isWithinDocument(JCas jcas, Markable markable){
+        return markable.getBegin() >= 0 && markable.getEnd() <= jcas.getDocumentText().length();
+    }
+}

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Mon May  7 20:42:00 2018
@@ -10,6 +10,7 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.util.ClusterMentionFetcher;
 import org.apache.ctakes.coreference.util.MarkableCacheRelationExtractor;
 import org.apache.ctakes.coreference.util.MarkableUtilities;
+import org.apache.ctakes.coreference.util.ThymeCasOrderer;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
@@ -203,10 +204,15 @@ public class MentionClusterCoreferenceAn
 
   protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
           JCas jcas,
-          Markable mention){
+          Markable mention,
+          JCas prevCas){
     LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();
     for(ClusterMentionPairer_ImplBase pairer : this.pairExtractors){
-      pairs.addAll(pairer.getPairs(jcas, mention));
+      if(prevCas != null && pairer instanceof CrossDocumentPairer_ImplBase){
+        pairs.addAll(((CrossDocumentPairer_ImplBase)pairer).getPairs(jcas, mention, prevCas));
+      }else {
+        pairs.addAll(pairer.getPairs(jcas, mention));
+      }
     }
 
     return pairs;
@@ -237,16 +243,19 @@ public class MentionClusterCoreferenceAn
     LOGGER.info( "Finding Coreferences ..." );
 
     // It is possible that the cas for an entire patient has been passed through.  Try to process all documents.
-    final Collection<JCas> docViews = PatientViewUtil.getDocumentViews( jCas );
-    if ( docViews.isEmpty() ) {
+    final Collection<JCas> views = PatientViewUtil.getDocumentViews( jCas );
+    if ( views.isEmpty() ) {
       // There is only one document in the cas - the default
-      processDocument( jCas );
+      processDocument( jCas, null );
       LOGGER.info( "Finished." );
       return;
     }
+    JCas prevView = null;
     try ( DotLogger dotter = new DotLogger() ) {
-      for ( JCas view : docViews ) {
-        processDocument( view );
+      for ( JCas view : ThymeCasOrderer.getOrderedCases(jCas) ) {
+        LOGGER.info("Processing document with view name: " + view.getViewName());
+        processDocument( view, prevView );
+        prevView = view;
       }
     } catch ( IOException ioE ) {
       LOGGER.error( ioE.getMessage() );
@@ -254,7 +263,7 @@ public class MentionClusterCoreferenceAn
     LOGGER.info( "Finished." );
   }
 
-  private void processDocument( final JCas jCas ) throws AnalysisEngineProcessException {
+  private void processDocument( final JCas jCas, final JCas prevCas ) throws AnalysisEngineProcessException {
     // lookup from pair of annotations to binary text relation
     // note: assumes that there will be at most one relation per pair
     Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation>
@@ -292,7 +301,7 @@ public class MentionClusterCoreferenceAn
         CollectionTextRelation maxCluster = null;
         String mentionView = mention.getView().getViewName();
 
-        for ( CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs( jCas, mention ) ) {
+        for ( CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs( jCas, mention, prevCas ) ) {
           CollectionTextRelation cluster = pair.getCluster();
           Markable firstElement = JCasUtil.select(cluster.getMembers(), Markable.class).iterator().next();
           String clusterHeadView = firstElement.getView().getViewName();
@@ -337,6 +346,10 @@ public class MentionClusterCoreferenceAn
             // create a classification instance and write it to the training data
             this.dataWriter.write( new Instance<>( category, features ) );
             if ( !category.equals( NO_RELATION_CATEGORY ) ) {
+//              LOGGER.warn("Coref training: Writing link between mention: " + mention.getCoveredText() + " and previous cluster containing mention: " + firstElement.getCoveredText());
+              if(!clusterHeadView.equals(mentionView)){
+                LOGGER.info("Writing positive instance linking mention [" + mention.getCoveredText() + "] to cluster with elements from previous document");
+              }
               singleton = false;
               break;
             }

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java?rev=1831126&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/ThymeAnaforaCrossDocCorefXmlReader.java Mon May  7 20:42:00 2018
@@ -0,0 +1,301 @@
+package org.apache.ctakes.coreference.ae;
+
+import com.google.common.collect.Maps;
+import org.apache.ctakes.core.patient.AbstractPatientConsumer;
+import org.apache.ctakes.core.patient.PatientNoteStore;
+import org.apache.ctakes.core.patient.PatientViewUtil;
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.core.util.SourceMetadataUtil;
+import org.apache.ctakes.temporal.ae.THYMEAnaforaXMLReader;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewUriUtil;
+import org.jdom2.Element;
+import org.jdom2.JDOMException;
+import org.jdom2.input.SAXBuilder;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by tmill on 2/22/18.
+ */
+public class ThymeAnaforaCrossDocCorefXmlReader extends AbstractPatientConsumer {
+
+    public static final String PARAM_XML_DIRECTORY = "XmlDirectory";
+    @ConfigurationParameter(
+            name = PARAM_XML_DIRECTORY,
+            description = "Directory containing cross-document coreference annotations"
+    )String xmlDir;
+
+    public static final String PARAM_IS_TRAINING = "IsTraining";
+    @ConfigurationParameter(
+            name = PARAM_IS_TRAINING,
+            description = "Whether this reader is being called at training or test time, and thus whether gold annotations should be put in document or gold CAS"
+    )boolean isTraining;
+
+    private static final String NAME = ThymeAnaforaCrossDocCorefXmlReader.class.getSimpleName();
+    private static final Logger LOGGER = Logger.getLogger(ThymeAnaforaCrossDocCorefXmlReader.class);
+
+    /**
+     * Passes {@code NAME} and a one-sentence description of this reader to the superclass constructor.
+     */
+    public ThymeAnaforaCrossDocCorefXmlReader(){
+        super(NAME,
+                "Reads gold standard cross-document coreference annotations "
+                        + "in the format created for the THYME project, using the Anafora tool.");
+    }
+
+    /**
+     * Builds an engine description for this reader with both configuration parameters set.
+     *
+     * @param xmlDir   value for {@code PARAM_XML_DIRECTORY}
+     * @param training value for {@code PARAM_IS_TRAINING}
+     * @return the description created by {@code AnalysisEngineFactory}
+     * @throws ResourceInitializationException if the description cannot be created
+     */
+    public static AnalysisEngineDescription getDescription(String xmlDir, boolean training) throws ResourceInitializationException {
+        return AnalysisEngineFactory.createEngineDescription(
+                ThymeAnaforaCrossDocCorefXmlReader.class,
+                PARAM_XML_DIRECTORY, xmlDir,
+                PARAM_IS_TRAINING, training);
+    }
+
+    /**
+     * @return {@code NAME} with a suffix telling the training-time reader apart from the test-time one
+     */
+    @Override
+    public String getEngineName() {
+        if (isTraining) {
+            return NAME + "_training";
+        }
+        return NAME + "_test";
+    }
+
+    @Override
+    protected void processPatientCas(JCas patientJcas) throws AnalysisEngineProcessException {
+        String patientName = SourceMetadataUtil.getPatientIdentifier( patientJcas );
+        String xmlFilename = String.format("%s.Thyme2v1-PostProc.gold.completed.xml", patientName);
+        File annotationDir = null;
+        for(String subdir : new String[]{"Train", "Dev", "Test"}){
+            annotationDir = new File(new File(this.xmlDir, subdir), patientName);
+            if(annotationDir.exists()) break;
+        }
+        if(annotationDir == null){
+            System.err.println("Could not find a cross-doc coreference file for patient: " + patientName + " in the specified directory: " + this.xmlDir);
+            throw new AnalysisEngineProcessException();
+        }
+        File annotationFile = new File(annotationDir, xmlFilename);
+        if(!annotationFile.exists()){
+//            LOGGER.warn("No *PostProc.gold.completed.xml file for this patient... trying Correction...");
+//            xmlFilename = String.format("%s.Thyme2v1-Correction.gold.completed.xml", patientName);
+//            annotationFile = new File(annotationDir, xmlFilename);
+//            if (!annotationFile.exists()) {
+            LOGGER.error("No *Correction.gold.completed.xml file exists for this patient either... please remove from dataset");
+            throw new AnalysisEngineProcessException();
+//            }
+        }
+        Map<String,String> notes = new HashMap<>();
+
+        for(File file : annotationDir.listFiles()){
+            if(file.isDirectory()){
+                String fileContents = null;
+                File noteFile = new File(file, file.getName());
+                try {
+                    fileContents = new String(Files.readAllBytes(Paths.get(noteFile.toURI())));
+                } catch (IOException e) {
+                    e.printStackTrace();
+                    throw new AnalysisEngineProcessException(e);
+                }
+                notes.put(file.getName(), fileContents);
+            }
+        }
+        processXmlfile(patientJcas, annotationFile, notes);
+    }
+
+    /**
+     * Parses one Anafora XML file and adds its gold annotations to the per-note gold views of the patient cas.
+     * "Markable" entities become {@link Markable} annotations; "Identical" relations become a chain of
+     * pairwise {@link CoreferenceRelation}s plus one {@link CollectionTextRelation}. Entities and relations
+     * that cannot be interpreted are logged and skipped.
+     *
+     * NOTE(review): {@code isTraining} is not consulted here; annotations always go to the gold views.
+     *
+     * @param patientJcas the patient-level cas whose views are searched for the gold views
+     * @param xmlFile     the Anafora XML file to read
+     * @param notes       note name -> note text, used to validate and trim spans
+     * @throws AnalysisEngineProcessException if the XML file cannot be read or parsed
+     */
+    private void processXmlfile(JCas patientJcas, File xmlFile, Map<String,String> notes) throws AnalysisEngineProcessException {
+        // load the XML
+        Element dataElem;
+        try {
+            dataElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement();
+        } catch (JDOMException | IOException e) {
+            // MalformedURLException is an IOException, so it is covered here as well
+            throw new AnalysisEngineProcessException(e);
+        }
+
+        // note name -> gold view: a view whose name contains both the note name and the gold prefix
+        Map<String,JCas> goldCases = new HashMap<>();
+        for(String docName : notes.keySet()) {
+            for(JCas goldView : PatientViewUtil.getAllViews(patientJcas)){
+                String viewName = goldView.getViewName();
+                if(viewName.contains(docName) && viewName.contains(PatientViewUtil.GOLD_PREFIX)) {
+                    goldCases.put(docName, goldView);
+                }
+            }
+        }
+
+        for (Element annotationsElem : dataElem.getChildren("annotations")) {
+            // keep track of entity ids as we read entities so that we can find them from the map annotations later:
+            Map<String, Annotation> idToAnnotation = Maps.newHashMap();
+
+            for (Element entityElem : annotationsElem.getChildren("entity")) {
+                String id = removeSingleChildText(entityElem, "id", null);
+                String entNoteName = noteNameFromId(id);
+                if (entNoteName == null) {
+                    error("an id that does not name a note", id);
+                    continue;
+                }
+                String entNote = notes.get(entNoteName);
+                JCas entCas = goldCases.get(entNoteName);
+                if (entNote == null || entCas == null) {
+                    error(String.format("a reference to note '%s', which has no text or no gold view,", entNoteName), id);
+                    continue;
+                }
+                int docLen = entNote.length();
+                Element spanElem = removeSingleChild(entityElem, "span", id);
+                String type = removeSingleChildText(entityElem, "type", id);
+                // result unused; the call logs when there is not exactly one 'properties' child
+                removeSingleChild(entityElem, "properties", id);
+                if (spanElem == null) {
+                    // already logged by removeSingleChild
+                    continue;
+                }
+
+                // UIMA doesn't support disjoint spans, so take the span enclosing
+                // everything
+                int begin = Integer.MAX_VALUE;
+                int end = Integer.MIN_VALUE;
+                for (String spanString : spanElem.getText().split(";")) {
+                    String[] beginEndStrings = spanString.split(",");
+                    if (beginEndStrings.length != 2) {
+                        error("span not of the format 'number,number'", id);
+                        continue;
+                    }
+                    int spanBegin;
+                    int spanEnd;
+                    try {
+                        spanBegin = Integer.parseInt(beginEndStrings[0].trim());
+                        spanEnd = Integer.parseInt(beginEndStrings[1].trim());
+                    } catch (NumberFormatException e) {
+                        error("span not of the format 'number,number'", id);
+                        continue;
+                    }
+                    // offsets outside the note are ignored rather than widening the span
+                    if (spanBegin < begin && spanBegin >= 0) {
+                        begin = spanBegin;
+                    }
+                    if (spanEnd > end && spanEnd <= docLen) {
+                        end = spanEnd;
+                    }
+                }
+                // either sentinel surviving means no usable offset was seen on that side
+                if (begin == Integer.MAX_VALUE || end == Integer.MIN_VALUE || begin > end) {
+                    error("Illegal begin or end boundary", id);
+                    continue;
+                }
+
+                if ("Markable".equals(type)) {
+                    // strip trailing line breaks, but never move the end in front of the begin
+                    while (end > begin && (entNote.charAt(end - 1) == '\n' || entNote.charAt(end - 1) == '\r')) {
+                        end--;
+                    }
+                    Markable markable = new Markable(entCas, begin, end);
+                    markable.addToIndexes();
+                    idToAnnotation.put(id, markable);
+                } else {
+                    LOGGER.warn(String.format("Skipping entity type %s because the handler hasn't been written.", type));
+                }
+            }
+
+            for (Element relationElem : annotationsElem.getChildren("relation")) {
+                String id = removeSingleChildText(relationElem, "id", null);
+                String relNoteName = noteNameFromId(id);
+                String type = removeSingleChildText(relationElem, "type", id);
+                Element propertiesElem = removeSingleChild(relationElem, "properties", id);
+
+                if (!"Identical".equals(type)) {
+                    LOGGER.warn(String.format("This script cannot process relations of type %s yet.", type));
+                    continue;
+                }
+                JCas relCas = relNoteName == null ? null : goldCases.get(relNoteName);
+                if (relCas == null) {
+                    error("a relation whose note has no gold view", id);
+                    continue;
+                }
+                if (propertiesElem == null) {
+                    // already logged by removeSingleChild
+                    continue;
+                }
+
+                // Build list of Markables from FirstInstance and Coreferring_String annotations:
+                List<Markable> markables = new ArrayList<>();
+                String mention = removeSingleChildText(propertiesElem, "FirstInstance", id);
+                Markable antecedent = (Markable) idToAnnotation.get(mention);
+                if(antecedent != null){
+                    markables.add(antecedent);
+                }else{
+                    error("Null markable as FirstInstance", id);
+                }
+                for(Element coref : propertiesElem.getChildren("Coreferring_String")){
+                    Markable anaphor = (Markable) idToAnnotation.get(coref.getText());
+                    if(anaphor != null){
+                        markables.add(anaphor);
+                    }else{
+                        error("Null markable as Coreferring_String", id);
+                    }
+                }
+
+                // Iterate over markable list creating binary coref relations between neighbours:
+                for(int antInd = 0; antInd < markables.size()-1; antInd++){
+                    CoreferenceRelation pair = new CoreferenceRelation(relCas);
+                    pair.setCategory("Identity");
+                    RelationArgument arg1 = new RelationArgument(relCas);
+                    arg1.setArgument(markables.get(antInd));
+                    arg1.setRole("antecedent");
+                    pair.setArg1(arg1);
+                    RelationArgument arg2 = new RelationArgument(relCas);
+                    arg2.setArgument(markables.get(antInd + 1));
+                    arg2.setRole("anaphor");
+                    pair.setArg2(arg2);
+                    pair.addToIndexes();
+                }
+
+                // Create FSList from markable list and add to collection text relation:
+                if(markables.size() > 1){
+                    CollectionTextRelation chain = new CollectionTextRelation(relCas);
+                    FSList list = ListFactory.buildList(relCas, markables);
+                    list.addToIndexes();
+                    chain.setMembers(list);
+                    chain.addToIndexes();
+                }else{
+                    error("Coreference chain of length <= 1", id);
+                }
+                propertiesElem.removeChildren("Coreferring_String");
+            }
+        }
+    }
+
+    /**
+     * Extracts the note name from an Anafora id: the third '@'-separated field, e.g. ID001_clinic_001.
+     *
+     * @return the note name, or null if the id is null or has fewer than three fields
+     */
+    private static String noteNameFromId(String id) {
+        if (id == null) {
+            return null;
+        }
+        String[] parts = id.split("@");
+        return parts.length > 2 ? parts[2] : null;
+    }
+
+    /**
+     * Returns the first child of {@code elem} named {@code elemName}, logging an error through
+     * {@code error} whenever there is not exactly one such child.
+     *
+     * @param causeID id of the annotation being read, used in the log message only
+     * @return the first matching child, or null if there is none
+     */
+    private static Element getSingleChild(Element elem, String elemName, String causeID) {
+        List<Element> matches = elem.getChildren(elemName);
+        if (matches.size() != 1) {
+            error(String.format("not exactly one '%s' child", elemName), causeID);
+        }
+        if (matches.isEmpty()) {
+            return null;
+        }
+        return matches.get(0);
+    }
+
+    private static Element removeSingleChild(Element elem, String elemName, String causeID) {
+        Element child = getSingleChild(elem, elemName, causeID);
+        elem.removeChildren(elemName);
+        return child;
+    }
+
+    /**
+     * Returns the text of the child named {@code elemName} and removes all children of that name from
+     * {@code elem}.
+     *
+     * @param causeID id of the annotation being read, used in log messages only
+     * @return the child's text, or null if the child is missing (logged by {@code getSingleChild}) or its
+     *         text is empty (logged here)
+     */
+    private static String removeSingleChildText(Element elem, String elemName, String causeID) {
+        Element child = getSingleChild(elem, elemName, causeID);
+        // getSingleChild returns null for a missing child; don't dereference it
+        String text = child == null ? null : child.getText();
+        if (text != null && text.isEmpty()) {
+            error(String.format("an empty '%s' child", elemName), causeID);
+            text = null;
+        }
+        elem.removeChildren(elemName);
+        return text;
+    }
+
+    private static void error(String found, String id) {
+        LOGGER.error(String.format("found %s in annotation with ID %s", found, id));
+    }
+
+}

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java Mon May  7 20:42:00 2018
@@ -32,7 +32,7 @@ public class MentionClusterAgreementFeat
       throw new RuntimeException("This extractor requires a call to setCache()");
     }
     List<Feature> features = new ArrayList<>();
-    
+
     String s = mention.getCoveredText().toLowerCase();
     boolean isDem = isDemonstrative(s);
     boolean isDef = isDefinite(s);

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java Mon May  7 20:42:00 2018
@@ -28,6 +28,9 @@ public class MentionClusterStackFeatures
     
     NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
     Annotation mostRecent = ClusterUtils.getMostRecent(members, mention);
+    if(mostRecent == null){
+      return feats;
+    }
     int mentionEnd = mostRecent.getEnd();
     int numIntervening = 0;
     int numNonSingletonIntervening = 0;

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java Mon May  7 20:42:00 2018
@@ -47,7 +47,7 @@ public class MorphosyntacticFeatureExtra
       feats.add(new Feature("MorphoIsPronoun", false));
     }
     
-    feats.add(new Feature("MorphoIsProper", head.getPostag().equals("NNP")));
+    feats.add(new Feature("MorphoIsProper", (head != null && head.getPostag() != null && head.getPostag().equals("NNP"))));
     
     // skip animacy and person features for now -- planning to not do person mentions
     

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java?rev=1831126&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/CrossDocumentPairer_ImplBase.java Mon May  7 20:42:00 2018
@@ -0,0 +1,19 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import org.apache.ctakes.coreference.util.ClusterMentionFetcher;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.jcas.JCas;
+
+import java.util.List;
+
+/**
+ * Base class for cluster-mention pairers whose pairing method takes an additional
+ * {@code JCas} argument ({@code prevCas}) besides the CAS being processed.
+ *
+ * Created by tmill on 3/22/18.
+ */
+public abstract class CrossDocumentPairer_ImplBase extends ClusterMentionPairer_ImplBase {
+
+    /**
+     * Two-argument entry point: forwards to
+     * {@link #getPairs(JCas, Markable, JCas)} with {@code null} as {@code prevCas}.
+     */
+    @Override
+    public List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m) {
+        final JCas noPreviousCas = null;
+        return getPairs(jcas, m, noPreviousCas);
+    }
+
+    /**
+     * Builds the cluster/mention pairs for a markable.
+     *
+     * @param jcas    CAS passed by the caller alongside the markable
+     * @param m       markable to build pairs for
+     * @param prevCas second CAS supplied by the caller; may be {@code null}, which is what
+     *                the two-argument overload passes
+     * @return the pairs produced by the implementing class
+     */
+    public abstract List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m, JCas prevCas);
+}

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/PreviousDocumentPairer.java Mon May  7 20:42:00 2018
@@ -19,7 +19,7 @@
 package org.apache.ctakes.coreference.ae.pairing.cluster;
 
 import org.apache.ctakes.coreference.util.ClusterMentionFetcher;
-import org.apache.ctakes.temporal.utils.PatientViewsUtil;
+import org.apache.ctakes.coreference.util.ThymeCasOrderer;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
 import org.apache.uima.fit.util.JCasUtil;
@@ -31,11 +31,10 @@ import java.util.List;
 /**
  * Created by tmill on 9/21/17.
  */
-public class PreviousDocumentPairer extends ClusterMentionPairer_ImplBase {
+public class PreviousDocumentPairer extends CrossDocumentPairer_ImplBase {
     @Override
-    public List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m) {
+    public List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m, JCas prevCas) {
         List<ClusterMentionFetcher.CollectionTextRelationIdentifiedAnnotationPair> clusters = new ArrayList<>();
-        JCas prevCas = PatientViewsUtil.getPreviousDocumentCas(jcas);
         if(prevCas == null) return clusters;
 
         for(CollectionTextRelation chain : JCasUtil.select(prevCas, CollectionTextRelation.class)){

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java?rev=1831126&r1=1831125&r2=1831126&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java Mon May  7 20:42:00 2018
@@ -7,6 +7,7 @@ import com.google.common.collect.Sets;
 import com.lexicalscope.jewel.cli.CliFactory;
 import com.lexicalscope.jewel.cli.Option;
 import de.bwaldvogel.liblinear.FeatureNode;
+import org.apache.commons.lang.NotImplementedException;
 import org.apache.ctakes.assertion.medfacts.cleartk.*;
 import org.apache.ctakes.core.config.ConfigParameterConstants;
 import org.apache.ctakes.core.patient.AbstractPatientConsumer;
@@ -58,6 +59,7 @@ import org.apache.uima.fit.factory.Aggre
 import org.apache.uima.fit.factory.AnalysisEngineFactory;
 import org.apache.uima.fit.pipeline.JCasIterator;
 import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.FSCollectionFactory;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.FSArray;
@@ -85,6 +87,7 @@ import java.io.*;
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.stream.Collectors;
 
 public class EvaluationOfEventCoreference extends EvaluationOfTemporalRelations_ImplBase {
  
@@ -253,7 +256,7 @@ public class EvaluationOfEventCoreferenc
       aggregateBuilder.add(HistoryCleartkAnalysisEngine.createAnnotatorDescription());
       aggregateBuilder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription());
 
-      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ViewCreatorAnnotator.class, ViewCreatorAnnotator.PARAM_VIEW_NAME, "Baseline"));
+//      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ViewCreatorAnnotator.class, ViewCreatorAnnotator.PARAM_VIEW_NAME, "Baseline"));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphAnnotator.class));
       //      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphVectorAnnotator.class));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
@@ -261,21 +264,18 @@ public class EvaluationOfEventCoreferenc
       aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
       aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
       if(this.goldMarkables){
-        aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyGoldMarkablesInChains.class));
+        throw new NotImplementedException("Using gold markables needs to be rewritten to be compatible with patient-level annotations.");
+//        aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyGoldMarkablesInChains.class));
       }else{
         aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
         //    aggregateBuilder.add(CopyFromGold.getDescription(/*Markable.class,*/ CoreferenceRelation.class, CollectionTextRelation.class));
         aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
       }
-      // MarkableHeadTreeCreator creates a cache of mappings from Markables to dependency heads since so many feature extractors use that information
-      // major speedup
-//      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(MarkableHeadTreeCreator.class));
-      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class), CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME);
-      // the coreference module uses segments to index markables, but we don't have them in the gold standard.
       aggregateBuilder.add(CopyFromSystem.getDescription(Segment.class), GOLD_VIEW_NAME, GOLD_VIEW_NAME);
 
       aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
       if(this.evalType == EVAL_SYSTEM.MENTION_PAIR){
+        aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class), CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME);
         aggregateBuilder.add(EventCoreferenceAnnotator.createDataWriterDescription(
             //        TKSVMlightStringOutcomeDataWriter.class,
             FlushingDataWriter.class,
@@ -287,6 +287,8 @@ public class EvaluationOfEventCoreferenc
         Logger.getLogger(EventCoreferenceAnnotator.class).setLevel(Level.WARN);
       }else if(this.evalType == EVAL_SYSTEM.MENTION_CLUSTER){
         aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PatientNoteCollector.class));
+        aggregateBuilder.add(ThymeAnaforaCrossDocCorefXmlReader.getDescription(this.xmlDirectory.getAbsolutePath(), true ) );
+        aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCrossDocCoreferenceRelations.class));
         aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
                 PatientMentionClusterCoreferencer.class,
                 CleartkAnnotator.PARAM_IS_TRAINING,
@@ -300,6 +302,7 @@ public class EvaluationOfEventCoreferenc
                 MentionClusterCoreferenceAnnotator.PARAM_SINGLE_DOCUMENT,
                 false));
       }else if(this.evalType == EVAL_SYSTEM.CLUSTER_RANK){
+        aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class), CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME);
         aggregateBuilder.add(MentionClusterRankingCoreferenceAnnotator.createDataWriterDescription(
             SvmLightRankDataWriter.class,
             directory,
@@ -308,9 +311,6 @@ public class EvaluationOfEventCoreferenc
         logger.warn("Encountered a training configuration that does not add an annotator: " + this.evalType);
       }
 
-      // If we are using mention-cluster algorithm, it is aware of multiple documents so we only have to call it once.
-      //      FlowControllerDescription corefFlowControl = FlowControllerFactory.createFlowControllerDescription(CoreferenceFlowController.class);
-      //      aggregateBuilder.setFlowControllerDescription(corefFlowControl);
       AnalysisEngineDescription aed = aggregateBuilder.createAggregateDescription();
       SimplePipeline.runPipeline(collectionReader, AnalysisEngineFactory.createEngine(aed));
     }
@@ -350,7 +350,6 @@ public class EvaluationOfEventCoreferenc
     AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIdFromURI.class));
     aggregateBuilder.add("Patient id printer", AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
-//      AggregateBuilder singleNoteBuilder = new AggregateBuilder();
     aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
     aggregateBuilder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
     aggregateBuilder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription());
@@ -362,15 +361,10 @@ public class EvaluationOfEventCoreferenc
     aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
     aggregateBuilder.add(EventAnnotator.createAnnotatorDescription());
     aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
-//      singleNoteBuilder.add(AnalysisEngineFactory.createEngineDescription(CoreferenceChainScoringOutput.class,
-//          ConfigParameterConstants.PARAM_OUTPUTDIR,
-//          this.outputDirectory + File.separator + goldOut,
-//          CoreferenceChainScoringOutput.PARAM_GOLD_VIEW_NAME,
-//          goldViewName),
-//          CAS.NAME_DEFAULT_SOFA,
-//          viewName);
+
     if(this.goldMarkables){
-      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyGoldMarkablesInChains.class)); //CopyFromGold.getDescription(Markable.class));
+      throw new NotImplementedException("Using gold markables needs to be rewritten to be compatible with patient-level annotations.");
+//      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyGoldMarkablesInChains.class)); //CopyFromGold.getDescription(Markable.class));
     }else{
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
       //    aggregateBuilder.add(CopyFromGold.getDescription(/*Markable.class,*/ CoreferenceRelation.class, CollectionTextRelation.class));
@@ -379,7 +373,8 @@ public class EvaluationOfEventCoreferenc
     aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
     if(this.evalType == EVAL_SYSTEM.MENTION_CLUSTER) {
       // Do nothing but we still need this here so the else clause works right
-      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(EvaluationPatientNoteCollector.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PatientNoteCollector.class));
+      aggregateBuilder.add(ThymeAnaforaCrossDocCorefXmlReader.getDescription(this.xmlDirectory.getAbsolutePath(), false));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(PatientMentionClusterCoreferencer.class,
               CleartkAnnotator.PARAM_IS_TRAINING,
               false,
@@ -716,155 +711,92 @@ public class EvaluationOfEventCoreferenc
   }
 
   @PipeBitInfo(
-        name = "Coreference Copier",
-        description = "Sets Modality based upon context.",
+        name = "CrossDoc Coreference Copier",
+        description = "Copies markables and relations from gold to system view",
         role = PipeBitInfo.Role.SPECIAL,
         dependencies = { PipeBitInfo.TypeProduct.MARKABLE, PipeBitInfo.TypeProduct.COREFERENCE_RELATION }
   )
-  public static class CopyCoreferenceRelations extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
-    // TODO - make document aware for mention-cluster coreference? Not as easy as relation remover because this should work for
-    // non-document-aware annotators.
-    public static final String PARAM_GOLD_VIEW = "GoldViewName";
-    @ConfigurationParameter(name=PARAM_GOLD_VIEW, mandatory=false, description="View containing gold standard annotations")
-    private String goldViewName=GOLD_VIEW_NAME;
-    
-    public static final String PARAM_DROP_ELEMENTS = "Dropout";
-    @ConfigurationParameter(name = PARAM_DROP_ELEMENTS, mandatory=false)
-    private boolean dropout = false;
+  public static class CopyCrossDocCoreferenceRelations extends AbstractPatientConsumer {
+
+    public CopyCrossDocCoreferenceRelations() {
+      super("CopyCrossDocCoreferenceRelations", "Copy gold coreference relations from gold cas to system cas for training");
+    }
 
-    @SuppressWarnings("synthetic-access")
     @Override
-    public void process(JCas jcas) throws AnalysisEngineProcessException {
-      JCas goldView = null;
-      try {
-        goldView = jcas.getView(goldViewName);
-      } catch (CASException e) {
-        e.printStackTrace();
-        throw new AnalysisEngineProcessException(e);
-      }
-      
-      HashMap<Markable,Markable> gold2sys = new HashMap<>();
-      Map<ConllDependencyNode,Collection<Markable>> depIndex = JCasUtil.indexCovering(jcas, ConllDependencyNode.class, Markable.class);
-      // remove those with removed markables (person mentions)
-      List<CollectionTextRelation> toRemove = new ArrayList<>();
-      
-      for(CollectionTextRelation goldChain : JCasUtil.select(goldView, CollectionTextRelation.class)){
-        FSList head = goldChain.getMembers();
-//        NonEmptyFSList sysList = new NonEmptyFSList(jcas);
-//        NonEmptyFSList listEnd = sysList;
-        List<List<Markable>> systemLists = new ArrayList<>(); // the gold list can be split up into many lists if we allow dropout.
-        boolean removeChain = false;
-        List<Markable> prevList = null;
-        
-        // first one is guaranteed to be nonempty otherwise it would not be in cas
-        do{
-          NonEmptyFSList element = (NonEmptyFSList) head;
-          Markable goldMarkable = (Markable) element.getHead();
-          if(goldMarkable == null){
-            logger.error(String.format("Found an unexpected null gold markable"));
-          }
-          boolean mapped = mapGoldMarkable(jcas, goldMarkable, gold2sys, depIndex);
-          
-          // if we can't align the gold markable with one in the system cas then don't add it:
-          if(!mapped){
-            String text = "<Out of bounds>";
-            if(!(goldMarkable.getBegin() < 0 || goldMarkable.getEnd() >= jcas.getDocumentText().length())){
-              text = goldMarkable.getCoveredText();
-            }
-            logger.warn(String.format("There is a gold markable %s [%d, %d] which could not map to a system markable.", 
-                text, goldMarkable.getBegin(), goldMarkable.getEnd()));
-            removeChain = true;
-            break;
-          }
-          
-          Markable sysMarkable = gold2sys.get(goldMarkable);
-          if(!dropout || systemLists.size() == 0){
-            if(systemLists.size() == 0) systemLists.add(new ArrayList<>());
-            systemLists.get(0).add(sysMarkable);
-//            prevList = systemLists.get(0);
-//            // if this is not first time through move listEnd to end.
-//            if(listEnd.getHead() != null){
-//              listEnd.setTail(new NonEmptyFSList(jcas));
-//              listEnd.addToIndexes();
-//              listEnd = (NonEmptyFSList) listEnd.getTail();
-//            }
-//
-//            // add markable to end of list:
-//            listEnd.setHead(gold2sys.get(goldMarkable));
-          }else{
-            // 3 options: Do correctly (append to same list as last element), ii) Start its own list, iii) Randomly join another list
-            if(Math.random() > DROPOUT_RATE){
-              // most of the time do the right thing:
-              systemLists.get(0).add(sysMarkable);
-            }else{
-              int listIndex = (int) Math.ceil(Math.random() * systemLists.size());
-              if(listIndex == systemLists.size()){
-                systemLists.add(new ArrayList<>());
-              }
-              systemLists.get(listIndex).add(sysMarkable);
-            }
-          }
-          head = element.getTail();
-        }while(head instanceof NonEmptyFSList);
-        
-        // don't bother copying over -- the gold chain was of person mentions
-        if(!removeChain){
-//          listEnd.setTail(new EmptyFSList(jcas));
-//          listEnd.addToIndexes();
-//          listEnd.getTail().addToIndexes();
-//          sysList.addToIndexes();
-          for(List<Markable> chain : systemLists){
-            if(chain.size() > 1){
-              CollectionTextRelation sysRel = new CollectionTextRelation(jcas);
-              sysRel.setMembers(ListFactory.buildList(jcas, chain));
-              sysRel.addToIndexes();
-            }
+    public String getEngineName() {
+      return "CopyCrossDocCoreferenceRelations";
+    }
+
+    @Override
+    public void initialize(final UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+    }
+
+    @Override
+    protected void processPatientCas(JCas patientJcas) throws AnalysisEngineProcessException {
+      Collection<JCas> docCases = PatientViewUtil.getAllViews(patientJcas).
+              stream().
+              filter(s -> (s.getViewName().contains(CAS.NAME_DEFAULT_SOFA) && !s.getViewName().equals(CAS.NAME_DEFAULT_SOFA))).
+              collect(Collectors.toList());
+      Collection<JCas> goldCases = PatientViewUtil.getAllViews(patientJcas).
+              stream().
+              filter(s -> s.getViewName().contains(GOLD_VIEW_NAME)).
+              collect(Collectors.toList());
+      Map<Markable, Markable> gold2sys = new HashMap<>();
+
+      // Map all markables in gold cases to equivalents in docCases
+      for (JCas goldCas : goldCases) {
+        JCas docCas = getAlignedDocCas(docCases, goldCas);
+        if (docCas == null) {
+          logger.error("Could not find aligned document CAS for this gold CAS.");
+          throw new AnalysisEngineProcessException();
+        }
+        Map<ConllDependencyNode, Collection<Markable>> depIndex = JCasUtil.indexCovering(docCas, ConllDependencyNode.class, Markable.class);
+
+        for (Markable goldMarkable : JCasUtil.select(goldCas, Markable.class)) {
+          boolean match = CopyCoreferenceRelations.mapGoldMarkable(docCas, goldMarkable, gold2sys, depIndex);
+          if (!match) {
+            logger.warn(String.format("There is a gold markable %s [%d, %d] which could not map to a system markable.",
+                    goldMarkable.getCoveredText(), goldMarkable.getBegin(), goldMarkable.getEnd()));
+
           }
         }
       }
-      
-      for(CoreferenceRelation goldRel : JCasUtil.select(goldView, CoreferenceRelation.class)){
-        if((gold2sys.containsKey(goldRel.getArg1().getArgument()) && gold2sys.containsKey(goldRel.getArg2().getArgument()))){
-          CoreferenceRelation sysRel = new CoreferenceRelation(jcas);
-          sysRel.setCategory(goldRel.getCategory());
-          sysRel.setDiscoveryTechnique(CONST.REL_DISCOVERY_TECH_GOLD_ANNOTATION);
-
-          RelationArgument arg1 = new RelationArgument(jcas);
-          arg1.setArgument(gold2sys.get(goldRel.getArg1().getArgument()));
-          sysRel.setArg1(arg1);
-          arg1.addToIndexes();
-
-          RelationArgument arg2 = new RelationArgument(jcas);
-          arg2.setArgument(gold2sys.get(goldRel.getArg2().getArgument()));
-          sysRel.setArg2(arg2);
-          arg2.addToIndexes();         
-          
-          sysRel.addToIndexes();        
+      // now go through all gold chains:
+      for (JCas goldCas : goldCases) {
+        JCas docCas = getAlignedDocCas(docCases, goldCas);
+        if (docCas == null) {
+          logger.error("Could not find aligned document CAS for this gold CAS.");
+          throw new AnalysisEngineProcessException();
+        }
+        // create system chains from all the mapped markables
+        for (CollectionTextRelation chain : JCasUtil.select(goldCas, CollectionTextRelation.class)) {
+          ArrayList<Markable> mappedElements = new ArrayList<>();
+          for (Markable goldElement : JCasUtil.select(chain.getMembers(), Markable.class)) {
+            Markable sysElement = gold2sys.get(goldElement);
+            if (sysElement != null) mappedElements.add(sysElement);
+          }
+          if (mappedElements.size() <= 1) {
+            logger.warn("Gold chain did not have enough markables map to system markables.");
+          } else {
+            CollectionTextRelation sysChain = new CollectionTextRelation(docCas);
+            sysChain.setMembers(FSCollectionFactory.createFSList(docCas, mappedElements));
+            sysChain.addToIndexes();
+          }
         }
       }
     }
-    
-    private static boolean mapGoldMarkable(JCas jcas, Markable goldMarkable, Map<Markable,Markable> gold2sys, Map<ConllDependencyNode, Collection<Markable>> depIndex){
-      if(!(goldMarkable.getBegin() < 0 || goldMarkable.getEnd() >= jcas.getDocumentText().length())){
-        
-        
-        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
 
-        for(Markable sysMarkable : depIndex.get(headNode)){
-          ConllDependencyNode markNode = DependencyUtility.getNominalHeadNode(jcas, sysMarkable);
-          if(markNode == headNode){
-            gold2sys.put(goldMarkable, sysMarkable);
-            return true;
-          }
+    private static JCas getAlignedDocCas(Collection<JCas> docCases, JCas goldCas) {
+      JCas docCas = null;
+
+      for (JCas candidate : docCases) {
+        if (goldCas.getViewName().replace(GOLD_VIEW_NAME, CAS.NAME_DEFAULT_SOFA).equals(candidate.getViewName())) {
+          docCas = candidate;
+          break;
         }
-      }else{
-        // Have seen some instances where anafora writes a span that is not possible, log them
-        // so they can be found and fixed:
-        logger.warn(String.format("There is a markable with span [%d, %d] in a document with length %d\n", 
-            goldMarkable.getBegin(), goldMarkable.getEnd(), jcas.getDocumentText().length()));
-        return false;
       }
-      return false;
+      return docCas;
     }
   }
   
@@ -914,7 +846,7 @@ public class EvaluationOfEventCoreferenc
             !markable.getCoveredText().toLowerCase().equals("it")){
           toRemove.add(markable);
         }else if(coveredTokens.size() > 0 && (coveredTokens.get(0).getCoveredText().startsWith("Mr.") || coveredTokens.get(0).getCoveredText().startsWith("Dr.") ||
-                coveredTokens.get(0).getCoveredText().startsWith("Mrs.") || coveredTokens.get(0).getCoveredText().startsWith("Ms."))){
+                coveredTokens.get(0).getCoveredText().startsWith("Mrs.") || coveredTokens.get(0).getCoveredText().startsWith("Ms.") || coveredTokens.get(0).getCoveredText().startsWith("Miss"))){
           toRemove.add(markable);
         }else if(markable.getCoveredText().toLowerCase().endsWith("patient") || markable.getCoveredText().toLowerCase().equals("pt")){
           toRemove.add(markable);

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java?rev=1831126&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ThymeCasOrderer.java Mon May  7 20:42:00 2018
@@ -0,0 +1,52 @@
+package org.apache.ctakes.coreference.util;
+
+import org.apache.ctakes.core.patient.PatientViewUtil;
+import org.apache.ctakes.coreference.ae.ThymeAnaforaCrossDocCorefXmlReader;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.jcas.JCas;
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Comparator and helper for ordering the document views of a patient CAS by the
+ * trailing number in each view's name (e.g. {@code 003} in {@code ID001_clinic_003}).
+ *
+ * Created by tmill on 3/22/18.
+ */
+public class ThymeCasOrderer implements Comparator<JCas> {
+    // Matches ID<digits>_<text without underscores>_<digits>; group 3 is the trailing number.
+    private static final Pattern fnPatt = Pattern.compile("ID(\\d+)_([^_]+)_(\\d+)");
+    static ThymeCasOrderer sorter = new ThymeCasOrderer();
+    // Index assigned to view names that do not contain the expected pattern.
+    private static final int UNKNOWN_DOC_INDEX = -1;
+
+    // TODO: Maybe this should just be done when we read them in?
+    /**
+     * Collects the views whose name contains the default sofa name without being exactly
+     * that name, and returns them sorted with this comparator.
+     *
+     * @param jCas CAS whose views are enumerated via {@code PatientViewUtil.getAllViews}
+     * @return a new, sorted list of the matching views
+     */
+    public static List<JCas> getOrderedCases(JCas jCas) {
+        List<JCas> cases = new ArrayList<>();
+
+        for (JCas view : PatientViewUtil.getAllViews(jCas)) {
+            String viewName = view.getViewName();
+            // contains the default CAS name but isn't _exactly_ the default CAS name (that would be the main patient cas)
+            if (viewName.contains(CAS.NAME_DEFAULT_SOFA) &&
+                    viewName.length() > CAS.NAME_DEFAULT_SOFA.length()) {
+                cases.add(view);
+            }
+        }
+        // Ordered by the last item of the name (e.g. ID001_clinic_003 uses 003 as its index)
+        cases.sort(sorter);
+        return cases;
+    }
+
+    /**
+     * Compares two views by the trailing number in their view names; a name that does not
+     * match the expected pattern is treated as index -1 and therefore sorts first.
+     */
+    @Override
+    public int compare(JCas cas0, JCas cas1) {
+        return Integer.compare(getDocIndex(cas0.getViewName()), getDocIndex(cas1.getViewName()));
+    }
+
+    // Extracts the trailing number from a view name, or UNKNOWN_DOC_INDEX when the pattern is absent.
+    private static int getDocIndex(String viewName) {
+        Matcher m = fnPatt.matcher(viewName);
+        return m.find() ? Integer.parseInt(m.group(3)) : UNKNOWN_DOC_INDEX;
+    }
+}