Posted to commits@ctakes.apache.org by tm...@apache.org on 2016/06/16 14:51:51 UTC

svn commit: r1748736 [2/5] - in /ctakes/trunk/ctakes-coreference: ./ src/main/java/org/apache/ctakes/coreference/ae/ src/main/java/org/apache/ctakes/coreference/ae/features/ src/main/java/org/apache/ctakes/coreference/ae/features/cluster/ src/main/java...

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,525 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAttributeFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
+import org.apache.ctakes.coreference.ae.pairing.cluster.ClusterMentionPairer_ImplBase;
+import org.apache.ctakes.coreference.ae.pairing.cluster.ClusterPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.HeadwordPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.SectionHeaderPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.SentenceDistancePairer;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.CleartkProcessingException;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+import org.cleartk.util.ViewUriUtil;
+
+public class MentionClusterCoreferenceAnnotator extends CleartkAnnotator<String> {
+  public static final String NO_RELATION_CATEGORY = "-NONE-";
+  public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE =
+      "ProbabilityOfKeepingANegativeExample";
+  @ConfigurationParameter(
+      name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+      mandatory = false,
+      description = "probability that a negative example should be retained for training")
+  protected double probabilityOfKeepingANegativeExample = 0.5;
+
+  public static final String PARAM_USE_EXISTING_ENCODERS="UseExistingEncoders";
+  @ConfigurationParameter(name = PARAM_USE_EXISTING_ENCODERS,
+      mandatory=false,
+      description = "Whether to use encoders in output directory during data writing; if we are making multiple calls")
+  private boolean useExistingEncoders=false;
+      
+  protected Random coin = new Random(0);
+
+  boolean greedyFirst = true;
+  
+  private static DataWriter<String> classDataWriter = null;
+  
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<String>> dataWriterClass,
+      File outputDirectory,
+      float downsamplingRate) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        MentionClusterCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        MentionClusterCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+        downsamplingRate,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriterClass,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory);
+  }
+
+  public static AnalysisEngineDescription createAnnotatorDescription(
+      String modelPath) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        MentionClusterCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        modelPath);
+  }
+
+  private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> relationExtractors = this.getFeatureExtractors();
+  private List<FeatureExtractor1<Markable>> mentionExtractors = this.getMentionExtractors();
+  private List<ClusterMentionPairer_ImplBase> pairExtractors = this.getPairExtractors();
+  
+//  private Set<String> markableStrings = null;
+  
+  protected List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> getFeatureExtractors() {
+    List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> extractors = new ArrayList<>();
+    extractors.add(new MentionClusterAgreementFeaturesExtractor());
+    extractors.add(new MentionClusterStringFeaturesExtractor());
+    extractors.add(new MentionClusterSectionFeaturesExtractor());
+    extractors.add(new MentionClusterUMLSFeatureExtractor());
+    extractors.add(new MentionClusterDepHeadExtractor());
+    extractors.add(new MentionClusterStackFeaturesExtractor());
+    extractors.add(new MentionClusterSalienceFeaturesExtractor());
+    extractors.add(new MentionClusterAttributeFeaturesExtractor());
+//    extractors.add(new MentionClusterAttributeVectorExtractor()); // does nothing yet
+    
+//    extractors.add(new MentionClusterDistanceFeaturesExtractor());
+    
+    try {
+//      extractors.add(new MentionClusterDistSemExtractor("org/apache/ctakes/coreference/distsem/mimic_vectors.txt"));
+//      extractors.add(new MentionClusterDistSemExtractor("org/apache/ctakes/coreference/distsem/deps.words"));
+      extractors.add(new MentionClusterSemTypeDepPrefsFeatureExtractor());
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    
+    return extractors;
+  }
+  
+  protected List<FeatureExtractor1<Markable>> getMentionExtractors(){
+    List<FeatureExtractor1<Markable>> extractors = new ArrayList<>();
+    // mention features from pairwise system:
+    extractors.add(new MentionClusterAgreementFeaturesExtractor());
+    extractors.add(new MentionClusterSectionFeaturesExtractor());
+    extractors.add(new MentionClusterUMLSFeatureExtractor());
+    extractors.add(new MentionClusterDepHeadExtractor());
+    extractors.add(new MentionClusterSalienceFeaturesExtractor());
+
+//    try{
+//      extractors.add(new MentionClusterMentionFeaturesExtractor("org/apache/ctakes/coreference/distsem/ties1mil.lowercase.txt"));
+//    }catch(CleartkExtractorException e){
+//      e.printStackTrace();
+//    }
+    extractors.add(new MentionClusterAttributeFeaturesExtractor());
+
+    return extractors;
+  }
+  
+  protected List<ClusterMentionPairer_ImplBase> getPairExtractors(){
+    List<ClusterMentionPairer_ImplBase> pairers = new ArrayList<>();
+    int sentDist = 5;
+    pairers.add(new SentenceDistancePairer(sentDist));
+    pairers.add(new SectionHeaderPairer(sentDist));
+    pairers.add(new ClusterPairer(Integer.MAX_VALUE));
+    pairers.add(new HeadwordPairer());
+    return pairers;
+  }
+  
+  protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
+      JCas jcas,
+      Markable mention){
+    LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();   
+    for(ClusterMentionPairer_ImplBase pairer : this.pairExtractors){
+      pairs.addAll(pairer.getPairs(jcas, mention));
+    }
+   
+    return pairs;
+  }
+  
+  private void resetPairers(JCas jcas){
+    for(ClusterMentionPairer_ImplBase pairer : this.pairExtractors){
+      pairer.reset(jcas);
+    }
+  }
+   
+  @Override
+  public void initialize(UimaContext context) throws ResourceInitializationException {
+    super.initialize(context);
+    
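+    // Data writers may be shared across engine instances: the first training
+    // instance stashes its writer in the static classDataWriter field, and any
+    // later instance configured with PARAM_USE_EXISTING_ENCODERS reuses it (and
+    // therefore its feature encoders).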
+    if(this.useExistingEncoders && classDataWriter != null){
+      this.dataWriter = classDataWriter;
+    }else if(this.isTraining()){
+      classDataWriter = this.dataWriter;
+    }
+  }
+  
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    // lookup from (cluster, mention) pair to the gold cluster-membership relation
+    // note: assumes that there will be at most one relation per pair
+    this.resetPairers(jCas);
+    
+    Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
+    relationLookup = new HashMap<>();
+    if (this.isTraining()) {
+      for (CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)) {
+        for(IdentifiedAnnotation mention : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          CollectionTextRelationIdentifiedAnnotationRelation relation = 
+              new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+          relation.setCluster(cluster);
+          relation.setMention(mention);
+          relation.setCategory("CoreferenceClusterMember");
+          relation.addToIndexes();
+          // The key pairs the cluster and mention so the relation can be looked up by both arguments
+          CollectionTextRelationIdentifiedAnnotationPair key = new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention);
+          if(relationLookup.containsKey(key)){
+            String cat = relationLookup.get(key).getCategory();
+            System.err.println("Error in: "+ ViewUriUtil.getURI(jCas).toString());
+            System.err.println("Error! This attempted relation " + relation.getCategory() + " already has a relation " + cat + " at this span: " + mention.getCoveredText());
+          }
+          relationLookup.put(key, relation);
+        }
+      }
+    }
+
+    
+    for(Segment segment : JCasUtil.select(jCas, Segment.class)){
+      for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
+//        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
+        boolean singleton = true;
+        double maxScore = 0.0;
+        CollectionTextRelation maxCluster = null;
+        
+        for(CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs(jCas, mention)){
+          CollectionTextRelation cluster = pair.getCluster();
+          // apply all the feature extractors to extract the list of features
+          List<Feature> features = new ArrayList<>();
+          for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.relationExtractors) {
+            List<Feature> feats = extractor.extract(jCas, cluster, mention);
+            if (feats != null){
+//              Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName()));
+              features.addAll(feats);
+            }
+          }
+                 
+          for(FeatureExtractor1<Markable> extractor : this.mentionExtractors){
+            features.addAll(extractor.extract(jCas, mention));
+          }
+          
+          // here is where feature conjunctions can go (dupFeatures)
+          List<Feature> dupFeatures = new ArrayList<>();
+          // sanity check on feature values
+          for (Feature feature : features) {
+            if (feature.getValue() == null) {
+              feature.setValue("NULL");
+              String message = String.format("Null value found in %s from %s", feature, features);
+              System.err.println(message);
+              //            throw new IllegalArgumentException(String.format(message, feature, features));
+            }else{
+//              String prefix = null;
+              //  Durrett and Klein style feature conjunctions: pronoun type or pos tag. maybe try umls semantic-type?
+              /*
+              if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
+                prefix = "PRO_"+mentionText;
+              }else if(headNode != null && headNode.getPostag() != null){
+                prefix = headNode.getPostag();                
+              }else{
+                prefix = "UNK";
+              }
+              */
+              // headword-based feature conjunctions
+/*              if(headNode != null && headNode.getCoveredText() != null && headMatches(headNode.getCoveredText().toLowerCase(), features)){
+                prefix = "HEAD_MATCH";
+              }else{
+                prefix = "NO_HEAD_MATCH";
+              }
+*/
+              
+              // UMLS semantic type feature conjunctions
+              /*
+              for(Feature feat : features){
+                if(feat.getName().startsWith("ClusterSemType")){
+                  dupFeatures.add(new Feature(feat.getName()+"_"+feature.getName(), feature.getValue()));
+                }
+              }
+              */
+              
+//              if(prefix != null){
+//                dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue()));
+//              }
+            }            
+          }
+          
+          features.addAll(dupFeatures);
+                   
+          // during training, feed the features to the data writer
+          if (this.isTraining()) {
+            String category = this.getRelationCategory(relationLookup, cluster, mention);
+            if (category == null) {
+              continue;
+            }
+
+            // create a classification instance and write it to the training data
+            this.dataWriter.write(new Instance<>(category, features));
+            if(!category.equals(NO_RELATION_CATEGORY)){
+              singleton = false;
+              break;
+            }
+          }
+
+          // during classification feed the features to the classifier and create
+          // annotations
+          else {
+            String predictedCategory = this.classify(features);
+            // TODO look at scores in classifier and try best-pair rather than first-pair?
+            Map<String,Double> scores = this.classifier.score(features);
+            
+            // add a relation annotation if a true relation was predicted
+            if (!predictedCategory.equals(NO_RELATION_CATEGORY)) {
+//              Logger.getLogger("MCAnnotator").info(String.format("Making a pair with score %f", scores.get(predictedCategory)));
+              if(greedyFirst){
+                createRelation(jCas, cluster, mention, predictedCategory, scores.get(predictedCategory));
+                singleton = false;
+                // break here for the "closest-first" greedy decoding strategy (Soon et al., 2001); terminology from Lassalle and Denis (2013).
+                // For "best-first" we need to keep track of all relations with scores and only keep the highest.
+                break;
+              }
+              if(scores.get(predictedCategory) > maxScore){
+            	  maxScore = scores.get(predictedCategory);
+            	  maxCluster = cluster;
+              }
+            }
+          }
+        }
+        if(!this.isTraining() && !greedyFirst && maxCluster != null){
+          // make a link with the max cluster
+          createRelation(jCas, maxCluster, mention, "CoreferenceClusterMember", maxScore);
+        }
+                       
+        // if we got this far and never matched the markable to a cluster, then add it to the list.
+        // do this even during training -- it adds non-chain markables to the antecedent list, which will be seen during testing.
+        if(singleton){
+          // make the markable its own cluster:
+          CollectionTextRelation chain = new CollectionTextRelation(jCas);
+          NonEmptyFSList list = new NonEmptyFSList(jCas);
+          list.setHead(mention);
+          list.setTail(new EmptyFSList(jCas));
+          chain.setMembers(list);
+          chain.addToIndexes();
+          list.addToIndexes();
+          list.getTail().addToIndexes();
+        }
+      }
+    }
+    
+    removeSingletonClusters(jCas);
+  }
+  
+ 
+  /**
+   * Looks up the arguments in the specified lookup table and converts the
+   * relation into a label for classification
+   * 
+   * @return <i>null</i> if this category should not be processed for training;
+   *         otherwise the label sent to the data writer
+   */
+  protected String getRelationCategory(
+      Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup,
+      CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) {
+    CollectionTextRelationIdentifiedAnnotationRelation relation = 
+        relationLookup.get(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+    String category;
+    if (relation != null) {
+      category = relation.getCategory();
+    } else if (coin.nextDouble() <= this.probabilityOfKeepingANegativeExample) {
+      category = NO_RELATION_CATEGORY;
+    } else {
+      category = null;
+    }
+    return category;
+  }
+
+  /**
+   * Predict an outcome given a set of features. By default, this simply
+   * delegates to the object's <code>classifier</code>. Subclasses may override
+   * this method to implement more complex classification procedures.
+   * 
+   * @param features
+   *          The features to be classified.
+   * @return The predicted outcome (label) for the features.
+   */
+  protected String classify(List<Feature> features) throws CleartkProcessingException {
+    return this.classifier.classify(features);
+  }
+
+  /**
+   * Create a UIMA relation type based on arguments and the relation label. This
+   * allows subclasses to create/define their own types: e.g. coreference can
+   * create CoreferenceRelation instead of BinaryTextRelation
+   * 
+   * @param jCas
+   *          - JCas object, needed to create new UIMA types
+   * @param cluster
+   *          - The coreference cluster (first argument of the relation)
+   * @param mention
+   *          - The mention (second argument of the relation)
+   * @param predictedCategory
+   *          - Name of relation
+   * @param confidence
+   *          - Classifier score for the predicted category
+   */
+  protected void createRelation(
+      JCas jCas,
+      CollectionTextRelation cluster,
+      IdentifiedAnnotation mention,
+      String predictedCategory,
+      Double confidence) {
+    // add the relation to the CAS
+    CollectionTextRelationIdentifiedAnnotationRelation relation = new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+    relation.setCluster(cluster);
+    relation.setMention(mention);
+    relation.setCategory(predictedCategory);
+    relation.setConfidence(confidence);
+    relation.addToIndexes();
+    
+//    RelationArgument arg = new RelationArgument(jCas);
+//    arg.setArgument(mention);
+    ListFactory.append(jCas, cluster.getMembers(), mention);    
+  }
+
+
+  private static void removeSingletonClusters(JCas jcas){
+    List<CollectionTextRelation> toRemove = new ArrayList<>();
+    for(CollectionTextRelation rel : JCasUtil.select(jcas, CollectionTextRelation.class)){     
+      NonEmptyFSList head = (NonEmptyFSList) rel.getMembers();
+      if(head.getTail() instanceof EmptyFSList){
+        toRemove.add(rel);
+      }
+    }
+    
+    for(CollectionTextRelation rel : toRemove){
+      rel.removeFromIndexes();
+    }
+  }
+  
+//  private static final boolean dominates(Annotation arg1, Annotation arg2) {
+//    return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
+//  }
+
+  /*
+  public Set<String> getBestEnt(JCas jcas, CollectionTextRelation cluster){
+    Set<String> semTypes = new HashSet<>();
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      semTypes.addAll(getBestEnt(jcas, member));
+    }
+    return semTypes;
+  }
+  
+  public Set<String> getBestEnt(JCas jcas, Markable markable){
+    Set<String> bestEnts = new HashSet<>();
+    IdentifiedAnnotation bestEnt = null;
+    Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+    for(IdentifiedAnnotation ent : coveringEnts){
+      if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+      ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent);
+      if(entHead == head){
+        if(bestEnt == null){
+          bestEnt = ent;
+        }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+          // if the span of this entity is bigger than the biggest existing one:
+          bestEnt = ent;
+          otherBestEnts = new HashSet<>();
+        }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+          // there is another one with the exact same span and possibly different type!
+          otherBestEnts.add(ent);
+        }
+      }
+    }
+
+    if(bestEnt!=null){
+      bestEnts.add(bestEnt.getClass().getSimpleName());
+      for(IdentifiedAnnotation other : otherBestEnts){
+        bestEnts.add(other.getClass().getSimpleName());
+      }
+    }
+    return bestEnts;
+  }
+  */
+  
+  public Map<HashableArguments, Double> getMarkablePairScores(JCas jCas){
+    Map<HashableArguments, Double> scoreMap = new HashMap<>();
+    for(CoreferenceRelation reln : JCasUtil.select(jCas, CoreferenceRelation.class)){
+      HashableArguments pair = new HashableArguments(reln.getArg1().getArgument(), reln.getArg2().getArgument());
+      scoreMap.put(pair, reln.getConfidence());
+    }
+    return scoreMap;
+  }
+  
+  public static class CollectionTextRelationIdentifiedAnnotationPair {
+    private final CollectionTextRelation cluster;
+    private final IdentifiedAnnotation mention;
+    
+    public CollectionTextRelationIdentifiedAnnotationPair(CollectionTextRelation cluster, IdentifiedAnnotation mention){
+      this.cluster = cluster;
+      this.mention = mention;
+    }
+    
+    public final CollectionTextRelation getCluster(){
+      return this.cluster;
+    }
+    
+    public final IdentifiedAnnotation getMention(){
+      return this.mention;
+    }
+    
+    @Override
+    public boolean equals(Object obj) {
+      CollectionTextRelationIdentifiedAnnotationPair other = (CollectionTextRelationIdentifiedAnnotationPair) obj;
+      return (this.cluster == other.cluster &&
+          this.mention == other.mention);
+    }
+    
+    @Override
+    public int hashCode() {
+      return 31*cluster.hashCode() + (mention==null ? 0 : mention.hashCode());
+    }
+  }
+
+}
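
For reference, a minimal sketch of how MentionClusterCoreferenceAnnotator could be wired up for training and annotation with uimaFIT/ClearTK. This is not part of the commit; the LibLinear data writer class, the JarClassifierBuilder training call, and the model.jar path are assumptions based on common ClearTK conventions.

    import java.io.File;

    import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator;
    import org.apache.uima.analysis_engine.AnalysisEngineDescription;
    import org.cleartk.ml.jar.JarClassifierBuilder;
    import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;

    public class MentionClusterTrainingSketch {
      public static void main(String[] args) throws Exception {
        File modelDir = new File("target/coref-cluster-model");   // hypothetical output directory

        // Training: one instance per candidate (cluster, mention) pair, keeping
        // 50% of negative pairs (the downsampling rate).
        AnalysisEngineDescription dataWriter =
            MentionClusterCoreferenceAnnotator.createDataWriterDescription(
                LibLinearStringOutcomeDataWriter.class, modelDir, 0.5f);
        // ... run a collection reader plus the upstream cTAKES annotators, then dataWriter ...

        // Package the trained classifier as model.jar (standard ClearTK layout).
        JarClassifierBuilder.trainAndPackage(modelDir);

        // Annotation: point the annotator at the packaged model.
        AnalysisEngineDescription annotator =
            MentionClusterCoreferenceAnnotator.createAnnotatorDescription(
                new File(modelDir, "model.jar").getPath());
        // ... run the same upstream annotators followed by annotator ...
      }
    }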

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,688 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAttributeFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistSemExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterMentionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.CleartkProcessingException;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+import org.cleartk.ml.svmlight.rank.QidInstance;
+import org.cleartk.util.ViewUriUtil;
+
+public class MentionClusterRankingCoreferenceAnnotator extends CleartkAnnotator<Double> {
+  public static final String NO_RELATION_CATEGORY = "-NONE-";
+  public static final String CLUSTER_RELATION_CATEGORY = "CoreferenceClusterMember";
+  
+  public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE =
+      "ProbabilityOfKeepingANegativeExample";
+  @ConfigurationParameter(
+      name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+      mandatory = false,
+      description = "probability that a negative example should be retained for training")
+  protected double probabilityOfKeepingANegativeExample = 0.5;
+
+  protected Random coin = new Random(0);
+
+  boolean greedyFirst = true;
+  
+  private int qid = 0;
+  
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<?>> dataWriterClass,
+      File outputDirectory,
+      float downsamplingRate) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        MentionClusterRankingCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        MentionClusterRankingCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+        downsamplingRate,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriterClass,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory);
+  }
+
+  public static AnalysisEngineDescription createAnnotatorDescription(
+      String modelPath) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        MentionClusterRankingCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        modelPath);
+  }
+
+  private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> relationExtractors = this.getFeatureExtractors();
+  private List<FeatureExtractor1<Markable>> mentionExtractors = this.getMentionExtractors();
+  
+  private Set<String> markableStrings = null;
+  private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
+  private Map<String,Set<Markable>> headWordMarkables = null;
+  private Map<HashableArguments,Double> pairScores = null;
+  
+  protected List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> getFeatureExtractors() {
+    List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> extractors = new ArrayList<>();
+    extractors.add(new MentionClusterAgreementFeaturesExtractor());
+    extractors.add(new MentionClusterStringFeaturesExtractor());
+    extractors.add(new MentionClusterSectionFeaturesExtractor());
+    extractors.add(new MentionClusterUMLSFeatureExtractor());
+    extractors.add(new MentionClusterDepHeadExtractor());
+    extractors.add(new MentionClusterStackFeaturesExtractor());
+    extractors.add(new MentionClusterSalienceFeaturesExtractor());
+//    extractors.add(new MentionClusterDistanceFeaturesExtractor());
+    extractors.add(new MentionClusterAttributeFeaturesExtractor());
+    
+    try {
+      extractors.add(new MentionClusterDistSemExtractor());
+      extractors.add(new MentionClusterSemTypeDepPrefsFeatureExtractor());
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    
+    return extractors;
+  }
+  
+  protected List<FeatureExtractor1<Markable>> getMentionExtractors(){
+    List<FeatureExtractor1<Markable>> extractors = new ArrayList<>();
+    // mention features from pairwise system:
+    extractors.add(new MentionClusterAgreementFeaturesExtractor());
+    extractors.add(new MentionClusterSectionFeaturesExtractor());
+    extractors.add(new MentionClusterUMLSFeatureExtractor());
+    extractors.add(new MentionClusterDepHeadExtractor());
+    extractors.add(new MentionClusterSalienceFeaturesExtractor());
+
+    try {
+      extractors.add(new MentionClusterMentionFeaturesExtractor());
+    } catch (CleartkExtractorException e) {
+      e.printStackTrace();
+    }
+    extractors.add(new MentionClusterAttributeFeaturesExtractor());
+
+    return extractors;
+  }
+  
+  protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
+      JCas jcas,
+      IdentifiedAnnotation mention){
+    int sentDist = 5;
+    // using linked hash set ensures no duplicates:
+    LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();
+    pairs.addAll(getSentenceDistancePairs(jcas, mention, sentDist));
+    pairs.addAll(getSectionHeaderPairs(jcas, mention, sentDist));
+    pairs.addAll(getClusterPairs(jcas, mention, Integer.MAX_VALUE));
+    pairs.addAll(getHeadwordMatchPairs(jcas, mention, sentDist));
+    
+    return pairs;
+  }
+  
+  /*
+   * getExactStringMatchPairs()
+   * For mentions that have the exact string repeated elsewhere in the document we want to
+   * allow matching across any distance. We don't use the sentence distance parameter here.
+   * We make use of a global variable markableStrings that is a HashSet containing all the markable
+   * strings from this document.
+   */
+  private List<CollectionTextRelationIdentifiedAnnotationPair> getExactStringMatchPairs(
+      JCas jcas, IdentifiedAnnotation mention, int sentDist) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    
+    if(markableStrings.contains(mention.getCoveredText().toLowerCase())){
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent == null) continue;
+
+        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          if(m == mostRecent) break;
+          // see if any of the members of the cluster have the exact same string as this 
+          if(m.getCoveredText().toLowerCase().equals(mention.getCoveredText().toLowerCase())){
+            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+            break;
+          }
+        }
+      }
+    }
+    return pairs;
+  }
+  
+  /*
+   * getClusterPairs()
+   * In this method we allow linking to clusters containing more than one mention even if they
+   * are beyond a sentence distance. First we check whether the most recent mention in the cluster
+   * is within the specified sentence distance (presumably longer than the sentence distance passed into
+   * the method that constrains by distance). The wrinkle is that during training many clusters will have multiple
+   * members but only one before the focus mention. So we need to count the members of a cluster until we 
+   * get to the most recent one in the cluster. If that value is > 1 then we allow the pairing.
+   */
+  private List<CollectionTextRelationIdentifiedAnnotationPair> getClusterPairs(
+      JCas jcas, IdentifiedAnnotation mention, int sentDist) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){
+        continue;
+      }
+      int numMembers=0;
+      for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+        numMembers++;
+        if(m == mostRecent) break;
+      }
+      if(numMembers > 1){
+        pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+      }
+    }
+    
+    return pairs;
+  }
+
+  /*
+   * Here we want to add only things that are nearby. First we check the semantic type of the
+   * focus mention: if it is an Anatomical Site or Medication, candidate clusters are added
+   * regardless of distance. Otherwise we check how many sentences lie between the mention and
+   * the latest element of the cluster. Finally, if both the mention and the cluster map to
+   * named-entity types but the types do not overlap, the pair is skipped.
+   */
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getSentenceDistancePairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    Set<String> bestAnaTypes = getBestEnt(jcas, (Markable) mention);
+    
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()) continue;
+      
+      // check for distance if they are not anatomical site or medication
+      if(!(bestAnaTypes.contains(AnatomicalSiteMention.class.getSimpleName()) ||
+          bestAnaTypes.contains(MedicationEventMention.class.getSimpleName()))){
+
+        IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+        if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist) continue;
+      }
+
+      // check for types of cluster
+      Set<String> bestClusterTypes = getBestEnt(jcas, cluster);
+      if(bestAnaTypes.size() > 0 && bestClusterTypes.size() > 0){
+        boolean overlap = false;
+        for(String semType : bestAnaTypes){
+          if(bestClusterTypes.contains(semType)){
+            overlap = true;
+          }
+        }
+        // they both correspond to named entities but no overlap in which category of named entity.
+        if(!overlap){
+          continue;
+        }
+      }
+      pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));      
+    }
+    return pairs;
+  }
+
+  /*
+   * getSectionHeaderPairs()
+   * Here we want to add clusters where one of the members is on a line all by itself (a section header).
+   * To do this we leverage the annotation of Paragraphs, roughly the areas between newlines. If such a
+   * span only contains one sentence then we consider it a "header" (or, just as importantly, a list item).
+   * If it is a header we add it. Here we use sentDist so we don't bother adding things that will be added
+   * by the "sentence distance" method.
+   */
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getSectionHeaderPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
+      // first check if it is within sentence distance range -- if so we can ignore it because it will be included by the other pair generator
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) <= sentDist){
+        continue;
+      }
+      
+      // now check if any of the mentions are in a section header
+      List<Paragraph> pars = JCasUtil.selectCovered(jcas, Paragraph.class, 0, mention.getBegin());
+      for(int j = 0; j < pars.size(); j++){
+        boolean match = false;
+        Paragraph par = pars.get(j); // pars.get(pars.size()-j-1);
+        List<Sentence> coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par);
+        if(coveredSents != null && coveredSents.size() == 1){
+          // this paragraph contains exactly one sentence -- how we model section headers
+          // see if any of the cluster mentions are in the section header
+          for(Markable m : JCasUtil.select(members, Markable.class)){
+            if(dominates(par, m)){
+              pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+              match = true;
+              break;
+            }
+          }
+        }
+        if(match) break;
+      }
+    }
+    return pairs;
+  }
+  
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getHeadwordMatchPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+
+    ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
+    if(headNode == null){
+      Logger.getLogger(MentionClusterRankingCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it.");
+      return pairs;
+    }
+    String head = headNode.getCoveredText().toLowerCase();
+    if(headWordMarkables.containsKey(head)){
+      Set<Markable> headSet = headWordMarkables.get(head);
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent == null) continue;
+        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          if(headSet.contains(mostRecent)){
+            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+            break;
+          }
+          if(m == mostRecent) break;
+        }
+      }      
+    }
+    
+    return pairs;
+  }
+  
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    // lookup from (cluster, mention) pair to the gold cluster-membership relation
+    // note: assumes that there will be at most one relation per pair
+    markableStrings = new HashSet<>();
+    nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+    headWordMarkables = new HashMap<>();
+//    pairScores = getMarkablePairScores(jCas);
+    
+    Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
+    relationLookup = new HashMap<>();
+    if (this.isTraining()) {
+      for (CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)) {
+        for(IdentifiedAnnotation mention : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          CollectionTextRelationIdentifiedAnnotationRelation relation = 
+              new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+          relation.setCluster(cluster);
+          relation.setMention(mention);
+          relation.setCategory("CoreferenceClusterMember");
+          relation.addToIndexes();
+          // The key pairs the cluster and mention so the relation can be looked up by both arguments
+          CollectionTextRelationIdentifiedAnnotationPair key = new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention);
+          if(relationLookup.containsKey(key)){
+            String cat = relationLookup.get(key).getCategory();
+            System.err.println("Error in: "+ ViewUriUtil.getURI(jCas).toString());
+            System.err.println("Error! This attempted relation " + relation.getCategory() + " already has a relation " + cat + " at this span: " + mention.getCoveredText());
+          }
+          relationLookup.put(key, relation);
+        }
+      }
+    }
+
+    
+    for(Segment segment : JCasUtil.select(jCas, Segment.class)){
+      for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
+        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
+        String mentionText = mention.getCoveredText().toLowerCase();
+        boolean singleton = true;
+        double maxScore = Double.NEGATIVE_INFINITY;
+        CollectionTextRelation maxCluster = null;
+        List<Feature> mentionFeatures = new ArrayList<>();
+        for(FeatureExtractor1<Markable> extractor : this.mentionExtractors){
+          mentionFeatures.addAll(extractor.extract(jCas, mention));
+        }
+
+        for(CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs(jCas, mention)){
+          CollectionTextRelation cluster = pair.getCluster();
+          // apply all the feature extractors to extract the list of features
+          List<Feature> features = new ArrayList<>();
+          features.addAll(mentionFeatures);
+          
+          for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.relationExtractors) {
+            List<Feature> feats = extractor.extract(jCas, cluster, mention);
+            if (feats != null){
+//              Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName()));
+              features.addAll(feats);
+            }
+          }
+          
+          
+          // here is where feature conjunctions can go (dupFeatures)
+          List<Feature> dupFeatures = new ArrayList<>();
+          // sanity check on feature values
+          for (Feature feature : features) {
+            if (feature.getValue() == null) {
+              feature.setValue("NULL");
+              String message = String.format("Null value found in %s from %s", feature, features);
+              System.err.println(message);
+              //            throw new IllegalArgumentException(String.format(message, feature, features));
+            }else{
+              String prefix = null;
+//              if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
+//                prefix = "PRO_"+mentionText;
+//              }else if(headNode != null && headNode.getPostag() != null){
+//                prefix = headNode.getPostag();                
+//              }else{
+//                prefix = "UNK";
+//              }
+              if(prefix != null){
+                dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue()));
+              }
+            }
+          }
+          features.addAll(dupFeatures);    
+
+          // during training, feed the features to the data writer
+          // create a classification instance and write it to the training data
+
+          if (this.isTraining()) {
+            String category = this.getRelationCategory(relationLookup, cluster, mention);
+            if (category == null) {
+              continue;
+            }
+            double outVal = 1.0;
+            if(category.equals(NO_RELATION_CATEGORY)){
+              outVal = 0.0;
+            }
+
+            QidInstance<Double> inst = new QidInstance<>();
+            inst.setQid(String.valueOf(qid));
+            inst.addAll(features);
+            inst.setOutcome(outVal);
+            this.dataWriter.write(inst);
+            if(!category.equals(NO_RELATION_CATEGORY)){
+              singleton = false;
+              break;
+            }
+          }
+
+          // during classification feed the features to the classifier and create
+          // annotations
+          else {
+            Double prediction = this.classify(features);
+            if(prediction > maxScore){
+              maxScore = prediction;
+              maxCluster = cluster;
+            }
+          }
+        }
+        
+        markableStrings.add(mention.getCoveredText().toLowerCase());
+        
+        if(headNode != null){
+          String head = headNode.getCoveredText().toLowerCase();
+          if(!headWordMarkables.containsKey(head)){
+            headWordMarkables.put(head, new HashSet<Markable>());
+          }
+          headWordMarkables.get(head).add(mention);
+        }
+        
+        if(this.isTraining()){
+          // write a dummy link with only mention features:
+          QidInstance<Double> inst = new QidInstance<>();
+          inst.setQid(String.valueOf(qid));
+          for(Feature feat : mentionFeatures){
+            if(feat.getName() != null){
+              feat.setName("DUMMYLINK_" + feat.getName());
+            }
+          }
+          inst.addAll(mentionFeatures);
+          if(singleton){
+            inst.setOutcome(1.0);
+          }else{
+            inst.setOutcome(0.0);
+          }
+          this.dataWriter.write(inst);
+        }else{
+          Double nullPrediction = this.classify(mentionFeatures);
+          if(nullPrediction > maxScore){
+            // make the markable its own cluster:
+            CollectionTextRelation chain = new CollectionTextRelation(jCas);
+            NonEmptyFSList list = new NonEmptyFSList(jCas);
+            list.setHead(mention);
+            list.setTail(new EmptyFSList(jCas));
+            chain.setMembers(list);
+            chain.addToIndexes();
+            list.addToIndexes();
+            list.getTail().addToIndexes();
+          }else{
+            createRelation(jCas, maxCluster, mention, CLUSTER_RELATION_CATEGORY);
+          }
+        }
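+        // every candidate cluster for this mention (and its dummy new-cluster
+        // instance) shared this qid; the next mention starts a new ranking query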
+        qid++;
+      }
+    }
+    
+    removeSingletonClusters(jCas);
+  }
+  
+  /**
+   * Looks up the arguments in the specified lookup table and converts the
+   * relation into a label for classification
+   * 
+   * @return <i>null</i> if this category should not be processed for training;
+   *         otherwise the label sent to the data writer
+   */
+  protected String getRelationCategory(
+      Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup,
+      CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) {
+    CollectionTextRelationIdentifiedAnnotationRelation relation = 
+        relationLookup.get(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+    String category;
+    if (relation != null) {
+      category = relation.getCategory();
+    } else if (coin.nextDouble() <= this.probabilityOfKeepingANegativeExample) {
+      category = NO_RELATION_CATEGORY;
+    } else {
+      category = null;
+    }
+    return category;
+  }
+
+  /**
+   * Predict an outcome given a set of features. By default, this simply
+   * delegates to the object's <code>classifier</code>. Subclasses may override
+   * this method to implement more complex classification procedures.
+   * 
+   * @param features
+   *          The features to be classified.
+   * @return The predicted outcome (label) for the features.
+   */
+  protected Double classify(List<Feature> features) throws CleartkProcessingException {
+    return this.classifier.classify(features);
+  }
+
+  /**
+   * Create a UIMA relation type based on arguments and the relation label. This
+   * allows subclasses to create/define their own types: e.g. coreference can
+   * create CoreferenceRelation instead of BinaryTextRelation
+   * 
+   * @param jCas
+   *          - JCas object, needed to create new UIMA types
+   * @param cluster
+   *          - The coreference cluster (first argument of the relation)
+   * @param mention
+   *          - The mention (second argument of the relation)
+   * @param predictedCategory
+   *          - Name of relation
+   */
+  protected void createRelation(
+      JCas jCas,
+      CollectionTextRelation cluster,
+      IdentifiedAnnotation mention,
+      String predictedCategory) {
+    // add the relation to the CAS
+    CollectionTextRelationIdentifiedAnnotationRelation relation = new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+    relation.setCluster(cluster);
+    relation.setMention(mention);
+    relation.setCategory(predictedCategory);
+    relation.addToIndexes();
+    
+//    RelationArgument arg = new RelationArgument(jCas);
+//    arg.setArgument(mention);
+    ListFactory.append(jCas, cluster.getMembers(), mention);    
+  }
+
+
+  private void removeSingletonClusters(JCas jcas){
+    List<CollectionTextRelation> toRemove = new ArrayList<>();
+    for(CollectionTextRelation rel : JCasUtil.select(jcas, CollectionTextRelation.class)){     
+      NonEmptyFSList head = (NonEmptyFSList) rel.getMembers();
+      if(head.getTail() instanceof EmptyFSList){
+        toRemove.add(rel);
+      }
+    }
+    
+    for(CollectionTextRelation rel : toRemove){
+      rel.removeFromIndexes();
+    }
+  }
+  
+  private static final boolean dominates(Annotation arg1, Annotation arg2) {
+    return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
+  }
+
+  public Set<String> getBestEnt(JCas jcas, CollectionTextRelation cluster){
+    Set<String> semTypes = new HashSet<>();
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      semTypes.addAll(getBestEnt(jcas, member));
+    }
+    return semTypes;
+  }
+  
+  public Set<String> getBestEnt(JCas jcas, Markable markable){
+    Set<String> bestEnts = new HashSet<>();
+    IdentifiedAnnotation bestEnt = null;
+    Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+    for(IdentifiedAnnotation ent : coveringEnts){
+      if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+      ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent);
+      if(entHead == head){
+        if(bestEnt == null){
+          bestEnt = ent;
+        }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+          // if the span of this entity is bigger than the biggest existing one:
+          bestEnt = ent;
+          otherBestEnts = new HashSet<>();
+        }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+          // there is another one with the exact same span and possibly different type!
+          otherBestEnts.add(ent);
+        }
+      }
+    }
+
+    if(bestEnt!=null){
+      bestEnts.add(bestEnt.getClass().getSimpleName());
+      for(IdentifiedAnnotation other : otherBestEnts){
+        bestEnts.add(other.getClass().getSimpleName());
+      }
+    }
+    return bestEnts;
+  }
+  
+  
+  public Map<HashableArguments, Double> getMarkablePairScores(JCas jCas){
+    Map<HashableArguments, Double> scoreMap = new HashMap<>();
+    for(CoreferenceRelation reln : JCasUtil.select(jCas, CoreferenceRelation.class)){
+      HashableArguments pair = new HashableArguments((IdentifiedAnnotation)reln.getArg1().getArgument(), (IdentifiedAnnotation)reln.getArg2().getArgument());
+      scoreMap.put(pair, reln.getConfidence());
+    }
+    return scoreMap;
+  }
+  
+  public static class CollectionTextRelationIdentifiedAnnotationPair {
+    private final CollectionTextRelation cluster;
+    private final IdentifiedAnnotation mention;
+    
+    public CollectionTextRelationIdentifiedAnnotationPair(CollectionTextRelation cluster, IdentifiedAnnotation mention){
+      this.cluster = cluster;
+      this.mention = mention;
+    }
+    
+    public final CollectionTextRelation getCluster(){
+      return this.cluster;
+    }
+    
+    public final IdentifiedAnnotation getMention(){
+      return this.mention;
+    }
+    
+    @Override
+    public boolean equals(Object obj) {
+      if(!(obj instanceof CollectionTextRelationIdentifiedAnnotationPair)){
+        return false;
+      }
+      CollectionTextRelationIdentifiedAnnotationPair other = (CollectionTextRelationIdentifiedAnnotationPair) obj;
+      return (this.cluster == other.cluster &&
+          this.mention == other.mention);
+    }
+    
+    @Override
+    public int hashCode() {
+      return 31*cluster.hashCode() + (mention==null ? 0 : mention.hashCode());
+    }
+  }
+
+}
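
For context, the clusters this annotator builds are left in the CAS as CollectionTextRelation chains whose getMembers() list holds the Markable mentions. A minimal sketch of reading them back out, illustrative only ("jCas" stands for a CAS this annotator has already processed):

    // Iterate the coreference chains in the CAS and print each chain's mentions.
    for(CollectionTextRelation chain : JCasUtil.select(jCas, CollectionTextRelation.class)){
      StringBuilder sb = new StringBuilder("Chain:");
      for(Markable m : JCasUtil.select(chain.getMembers(), Markable.class)){
        sb.append(" [").append(m.getCoveredText()).append("]");
      }
      System.out.println(sb.toString());
    }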

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,177 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.resource.ResourceInitializationException;
+
+public class PersonChainAnnotator extends JCasAnnotator_ImplBase {
+
+  @Override
+  public void process(JCas jcas) throws AnalysisEngineProcessException {
+    NonEmptyFSList ptList = new NonEmptyFSList(jcas);
+    ptList.setHead(null);
+    NonEmptyFSList weList = new NonEmptyFSList(jcas);
+    weList.setHead(null);
+    NonEmptyFSList drList = new NonEmptyFSList(jcas);
+    drList.setHead(null);
+    List<NonEmptyFSList> otherDrs = new ArrayList<>();
+    
+    List<WordToken> words = new ArrayList<>(JCasUtil.select(jcas, WordToken.class));
+    for(int i = 0; i < words.size(); i++){
+      WordToken word = words.get(i);
+      String text = word.getCoveredText();
+      if(word.getPartOfSpeech().startsWith("PRP")){
+        if(text.equalsIgnoreCase("I") || text.equalsIgnoreCase("me") || text.equalsIgnoreCase("my")){
+          Markable drMention = new Markable(jcas, word.getBegin(), word.getEnd());
+          addToList(jcas, drList, drMention);
+        }else if(text.equalsIgnoreCase("we") || text.equalsIgnoreCase("us") || text.equalsIgnoreCase("our")){
+          Markable weMention = new Markable(jcas, word.getBegin(), word.getEnd());
+          addToList(jcas, weList, weMention);
+        }else if(text.equalsIgnoreCase("it")){
+          // do nothing
+        }else{
+          Markable ptMention = new Markable(jcas, word.getBegin(), word.getEnd());
+          addToList(jcas, ptList, ptMention);
+        }
+      }else if(text.equalsIgnoreCase("dr.")){      
+        Markable drMention = getDoctorMarkable(jcas, word); //new Markable(jcas, word.getBegin(), words.get(i+1).getEnd());
+        addToList(jcas, getCorrectDoctor(jcas, drMention, otherDrs), drMention);
+      }else if(text.equalsIgnoreCase("mrs.") || text.equalsIgnoreCase("mr.") || text.equalsIgnoreCase("ms.")){
+        // TODO - smarter logic for Mr./Mrs./Ms. Firstname Lastname, as is done for Dr. via getDoctorMarkable()
+        int titleEnd = (i+1 < words.size()) ? words.get(i+1).getEnd() : word.getEnd();
+        Markable ptMention = new Markable(jcas, word.getBegin(), titleEnd);
+        addToList(jcas, ptList, ptMention);
+      }else if(text.equalsIgnoreCase("patient") || text.equalsIgnoreCase("pt")){
+        Markable ptMention = new Markable(jcas, word.getBegin(), word.getEnd());
+        addToList(jcas, ptList, ptMention);
+      }
+    }
+    
+    for(NonEmptyFSList otherDr : otherDrs){
+      if(otherDr.getHead() != null){
+        if(otherDr.getTail() != null){
+          endList(jcas, otherDr);
+          CollectionTextRelation drChain = new CollectionTextRelation(jcas);
+          drChain.setMembers(otherDr);
+          drChain.addToIndexes();
+        }
+      }
+    }
+    
+    if(drList.getHead() != null && drList.getTail() != null){
+      endList(jcas, drList);
+      CollectionTextRelation drChain = new CollectionTextRelation(jcas);
+      drChain.setMembers(drList);
+      drChain.addToIndexes();
+    }
+    if(ptList.getHead() != null && ptList.getTail() != null){
+      endList(jcas, ptList);
+      CollectionTextRelation ptChain = new CollectionTextRelation(jcas);
+      ptChain.setMembers(ptList);
+      ptChain.addToIndexes();
+    }
+    if(weList.getHead() != null && weList.getTail() != null){
+      endList(jcas, weList);
+      CollectionTextRelation weChain = new CollectionTextRelation(jcas);
+      weChain.setMembers(weList);
+      weChain.addToIndexes();
+    }
+  }
+
+  public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(PersonChainAnnotator.class);
+  }
+
+  private static void addToList(JCas jcas, NonEmptyFSList list, Markable arg){
+    arg.addToIndexes();
+    if(list.getHead() == null){
+      // first list element:
+      list.setHead(arg);
+    }else{
+      // subsequent list elements:
+      NonEmptyFSList cur = list;
+      while(cur.getTail() != null){
+        cur = (NonEmptyFSList)cur.getTail();
+      }
+      NonEmptyFSList tail = new NonEmptyFSList(jcas);
+      tail.setHead(arg);
+      cur.setTail(tail);
+      tail.addToIndexes();
+    }
+  }
+  
+  private static void endList(JCas jcas, NonEmptyFSList list){
+    NonEmptyFSList cur = list;
+    while(cur.getTail() != null){
+      cur = (NonEmptyFSList)cur.getTail();
+    }
+    EmptyFSList tail = new EmptyFSList(jcas);
+    cur.setTail(tail);
+    tail.addToIndexes();
+  }
+  
+  private static NonEmptyFSList getCorrectDoctor(JCas jcas, Markable mention, List<NonEmptyFSList> drLists){
+    NonEmptyFSList correctDr = null;
+    if(mention.getCoveredText().length() < 5){
+      if(drLists.size() > 0){
+        correctDr = drLists.get(0);
+      }
+    }else{
+      String nameText = mention.getCoveredText().substring(4);
+      for(NonEmptyFSList drList : drLists){
+        FSList curNode = drList;
+        do{
+          String otherName = ((Markable)((NonEmptyFSList)curNode).getHead()).getCoveredText();
+          if(otherName.length() >= 5){
+            otherName = otherName.substring(4);
+            if(otherName.contains(nameText) || nameText.contains(otherName)){
+              correctDr = drList;
+            }
+          }
+          curNode = ((NonEmptyFSList)curNode).getTail();
+        }while(curNode instanceof NonEmptyFSList);
+        if(correctDr != null) break;
+      }
+    }
+    if(correctDr == null){
+      correctDr = new NonEmptyFSList(jcas);
+      correctDr.setHead(null);
+      drLists.add(correctDr);
+    }
+    return correctDr;
+  }
+  
+  private static Markable getDoctorMarkable(JCas jcas, WordToken drToken){
+    Markable markable = null;
+    
+    // Walk up the dependency tree from the "Dr." token while the head is another proper noun (NNP),
+    // so the markable covers the full "Dr. Firstname Lastname" span.
+    ConllDependencyNode nnpHead = DependencyUtility.getDependencyNode(jcas, drToken);
+    try{
+      while(nnpHead != null && nnpHead.getHead() != null && nnpHead.getHead().getId() != 0 && nnpHead.getHead().getPostag().equals("NNP")){
+        nnpHead = nnpHead.getHead();
+      }
+    }catch(NullPointerException e){
+      // malformed dependency parse -- fall back to the span of the "Dr." token below
+      System.err.print(".");
+    }
+    
+    int start = drToken.getBegin();
+    int end = (nnpHead != null) ? nnpHead.getEnd() : drToken.getEnd();
+    if(end < start) end = drToken.getEnd();
+    
+    markable = new Markable(jcas, start, end);    
+    return markable;
+  }
+}
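
A minimal wiring sketch for this annotator, assuming the standard uimaFIT AggregateBuilder and SimplePipeline classes; the commented add() call stands for whatever upstream engines produce sentences, POS-tagged WordTokens, and ConllDependencyNodes, which this annotator relies on, and "jcas" is a placeholder for a CAS holding the document text:

    AggregateBuilder builder = new AggregateBuilder();
    // builder.add( ... sentence detector, tokenizer, POS tagger, dependency parser ... );
    builder.add(PersonChainAnnotator.createAnnotatorDescription());
    SimplePipeline.runPipeline(jcas, builder.createAggregateDescription());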

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,65 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isGeneric;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isHistory;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isNegated;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isPatient;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isUncertain;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+
+public class AttributeFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation, IdentifiedAnnotation> {
+
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation ante, IdentifiedAnnotation ana)
+      throws AnalysisEngineProcessException {
+    List<Feature> features = new ArrayList<>();
+    
+    boolean anaNegated = isNegated(ana);
+    features.add(new Feature("MC_ana_NEGATED", anaNegated));
+    boolean anaUncertain = isUncertain(ana);
+    features.add(new Feature("MC_ana_UNCERTAIN", anaUncertain));
+    boolean anaGen = isGeneric(ana);
+    features.add(new Feature("MC_ana_GENERIC", anaGen));
+    boolean anaSubj = isPatient(ana);
+    features.add(new Feature("MC_ana_PATIENT", anaSubj));
+    boolean anaHist = isHistory(ana);
+    features.add(new Feature("MC_ana_HISTORY", anaHist));
+    boolean anaTimex = isTimex(ana);
+    features.add(new Feature("MC_ana_TIMEX", anaTimex));
+    
+    boolean anteNegated = isNegated(ante);
+    features.add(new Feature("MC_ante_NEGATED", anteNegated));
+    boolean anteUncertain = isUncertain(ante);
+    features.add(new Feature("MC_ante_UNCERTAIN", anteUncertain));
+    boolean anteGen = isGeneric(ante);
+    features.add(new Feature("MC_ante_GENERIC", anteGen));
+    boolean anteSubj = isPatient(ante);
+    features.add(new Feature("MC_ante_PATIENT", anteSubj));
+    boolean anteHist = isHistory(ante);
+    features.add(new Feature("MC_ante_HISTORY", anteHist));
+    boolean anteTimex = isTimex(ante);
+    features.add(new Feature("MC_ante_TIMEX", anteTimex));
+    
+    features.add(new Feature("MC_AGREE_NEG", anteNegated == anaNegated));
+    features.add(new Feature("MC_AGREE_UNC", anteUncertain == anaUncertain));    
+    features.add(new Feature("MC_AGREE_TIMEX", anteTimex == anaTimex));
+
+    return features;
+  }
+  
+  private static boolean isTimex(Annotation a){
+    return JCasUtil.selectCovered(TimeMention.class, a).size() > 0;
+  }
+
+}
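
A minimal sketch of exercising this extractor on its own; "jCas", "ante", and "ana" are placeholders for a processed CAS and an antecedent/anaphor pair of IdentifiedAnnotations:

    List<Feature> feats = new AttributeFeatureExtractor().extract(jCas, ante, ana);
    for(Feature f : feats){
      System.out.println(f.getName() + " = " + f.getValue());   // e.g. MC_ana_NEGATED = false
    }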

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,32 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+public class CorefSyntaxFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+      IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    
+    ConllDependencyNode head1 = DependencyUtility.getNominalHeadNode(jCas, arg1);
+    ConllDependencyNode head2 = DependencyUtility.getNominalHeadNode(jCas, arg2);
+    
+    if(head1 != null){
+      feats.add(new Feature("Arg1Head", head1.getCoveredText().toLowerCase()));
+    }
+    if(head2 != null){
+      feats.add(new Feature("Arg2Head", head2.getCoveredText().toLowerCase()));
+    }
+    return feats;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,106 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.apache.uima.fit.util.JCasUtil;
+
+public class DistSemFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+  // default value is 0.5 (rather than 0.0) because we don't want to assume OOV words are dissimilar
+  public static final double DEFAULT_SIM = 0.5;  
+  
+  private WordEmbeddings words = null;
+  
+  public DistSemFeatureExtractor() throws FileNotFoundException, IOException{
+    words = WordVectorReader.getEmbeddings(FileLocator.getAsStream("org/apache/ctakes/coreference/distsem/mimic_vectors.txt"));
+  }
+  
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+      IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    
+    double sim = 0.0;
+//    double[] a1vec = getArgVector(arg1);
+//    double[] a2vec = getArgVector(arg2);
+//    
+//    if(a1vec != null && a2vec != null){
+//      for(int i = 0; i < a1vec.length; i++){
+//        sim += a1vec[i] * a2vec[i];
+//      }
+//    }else{
+//      sim = DEFAULT_SIM;
+//    }
+//    
+//    assert !Double.isNaN(sim);
+//    
+//    feats.add(new Feature("ARG_SIMILARITY_WORD2VEC", sim));
+    
+    ConllDependencyNode node1 = DependencyUtility.getNominalHeadNode(jCas, arg1);
+    ConllDependencyNode node2 = DependencyUtility.getNominalHeadNode(jCas, arg2);
+    String head1 = node1 != null ? node1.getCoveredText().toLowerCase() : null;
+    String head2 = node2 != null ? node2.getCoveredText().toLowerCase() : null;
+    if(head1 != null && head2 != null && words.containsKey(head1) && words.containsKey(head2)){
+      sim = words.getSimilarity(head1, head2);
+    }else{
+      sim = DEFAULT_SIM;
+    }
+    feats.add(new Feature("HEAD_SIMILARITY_WORD2VEC", sim));
+    
+    return feats;
+  }
+
+  
+  @SuppressWarnings("unused")
+  private double[] getArgVector(IdentifiedAnnotation arg){
+    double[] vec = null;
+    
+    Collection<BaseToken> tokens = JCasUtil.selectCovered(BaseToken.class, arg);
+    
+    for(BaseToken token : tokens){
+      WordVector wv = words.getVector(token.getCoveredText());
+      if(wv == null){
+        wv = words.getVector(token.getCoveredText().toLowerCase());
+      }
+      if(wv != null){
+        if(vec == null){
+          vec = new double[wv.size()];
+          Arrays.fill(vec, 0.0);
+        }
+        for(int i = 0; i < vec.length; i++){
+          vec[i] += wv.getValue(i);
+        }
+      }
+    }
+    
+    if(vec != null){
+      double len = 0.0;
+      for(int i = 0; i < vec.length; i++){
+        len += vec[i]*vec[i];
+      }
+      len = Math.sqrt(len);
+      assert !Double.isNaN(len);
+      for(int i = 0; i < vec.length; i++){
+        vec[i] /= len;
+      }
+    }
+    return vec;
+  }
+}
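
Because getArgVector() L2-normalizes its result, the dot product in the commented-out block above is exactly cosine similarity. A small illustrative helper, not part of the extractor, that makes the relationship explicit:

    // Cosine similarity of two L2-normalized vectors reduces to a plain dot product.
    static double cosine(double[] a, double[] b){
      double sim = 0.0;
      for(int i = 0; i < a.length; i++){
        sim += a[i] * b[i];
      }
      return sim;
    }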

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,29 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.coreference.util.CorefConsts;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.apache.uima.fit.util.JCasUtil;
+
+public class DistanceFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		List<Feature> feats = new ArrayList<>();
+		feats.add(new Feature("TOK_DIST",
+				  JCasUtil.selectCovered(jCas, BaseToken.class, arg1.getBegin(), arg2.getEnd()).size() / (double)CorefConsts.TOKDIST));
+		feats.add(new Feature("SENT_DIST",
+				JCasUtil.selectCovered(jCas, Sentence.class, arg1.getBegin(), arg2.getEnd()).size() / (double) CorefConsts.NEDIST));
+		return feats;
+	}
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,24 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+public class SalienceFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation, IdentifiedAnnotation> {
+
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation ante, IdentifiedAnnotation ana)
+      throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    
+    feats.add(new Feature("MP_ANTE_SALIENCE", ante.getConfidence()));
+    feats.add(new Feature("MP_ANA_SALIENCE", ana.getConfidence()));
+    return feats;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,56 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+public class SectionFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+  public List<Feature> extract(JCas jcas, IdentifiedAnnotation ante,
+      IdentifiedAnnotation ana) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    boolean anteInHeader = false;
+    boolean anaInHeader = false;
+    int antePar = -1;
+    int anaPar = -1;
+    
+    // Find section headers -- paragraphs 
+    List<Paragraph> pars = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class));
+    for(int i = 0; i < pars.size(); i++){
+      Paragraph par = pars.get(i);
+      if(par.getBegin() > ana.getEnd()){
+        break;
+      }
+      if(ante.getBegin() >= par.getBegin() && ante.getEnd() <= par.getEnd()){
+        antePar = i;
+      }
+      if(ana.getBegin() >= par.getBegin() && ana.getEnd() <= par.getEnd()){
+        anaPar = i;
+      }
+      List<Sentence> coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par);
+      if(coveredSents != null && coveredSents.size() == 1){
+        if(antePar == i){
+          anteInHeader = true;
+        }
+        if(anaPar == i){
+          anaInHeader = true;
+        }
+      }
+    }
+
+    feats.add(new Feature("AnteInHeader", anteInHeader));
+    feats.add(new Feature("AnaInHeader", anaInHeader));
+    if(anteInHeader && antePar+1 == anaPar){
+      feats.add(new Feature("AnteHeaderHeadsAna", true));      
+    }
+    return feats;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,141 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+
+public class StringMatchingFeatureExtractor implements
+		RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		List<Feature> feats = new ArrayList<>();
+		
+		// don't extract sim features if one of the markables is a pronoun
+		if(isPronoun(arg1) || isPronoun(arg2)) return feats;
+		
+		String s1 = arg1.getCoveredText();
+		String s2 = arg2.getCoveredText();
+		Set<String> words1 = contentWords(arg1);
+		Set<String> words2 = contentWords(arg2);
+		
+		feats.add(new Feature("MATCH_EXACT",
+				s1.equalsIgnoreCase(s2)));
+		feats.add(new Feature("MATCH_START",
+				startMatch(s1,s2)));
+		feats.add(new Feature("MATCH_END",
+				endMatch(s1,s2)));
+		feats.add(new Feature("MATCH_SOON",
+				soonMatch(s1,s2)));
+		feats.add(new Feature("MATCH_OVERLAP",
+				wordOverlap(words1, words2)));
+		feats.add(new Feature("MATCH_SUBSTRING",
+				wordSubstring(words1, words2)));
+		return feats;
+	}
+
+	public static boolean startMatch (String a, String b) {
+		int ia = a.indexOf(" ");
+		int ib = b.indexOf(" ");
+		String aa = a.substring(0, ia==-1?(a.length()>5?5:a.length()):ia);
+		String bb = b.substring(0, ib==-1?(b.length()>5?5:b.length()):ib);
+		return aa.equalsIgnoreCase(bb);
+	}
+
+	public static boolean endMatch (String a, String b) {
+		int ia = a.lastIndexOf(" ");
+		int ib = b.lastIndexOf(" ");
+		String aa = a.substring(ia==-1?(a.length()>5?a.length()-5:0):ia+1);
+		String bb = b.substring(ib==-1?(b.length()>5?b.length()-5:0):ib+1);
+		return aa.equalsIgnoreCase(bb);
+	}
+
+	public static boolean soonMatch (String s1, String s2) {
+		String sl1 = nonDetSubstr(s1.toLowerCase());
+		String sl2 = nonDetSubstr(s2.toLowerCase());
+		return sl1.equals(sl2);
+	}
+
+	public static String nonDetSubstr (String s) {
+		if(s.startsWith("the ")) return s.substring(4);
+		if(s.startsWith("a ")) return s.substring(2);
+		if(s.startsWith("this ")) return s.substring(5);
+		if(s.startsWith("that ")) return s.substring(5);
+		if(s.startsWith("these ")) return s.substring(6);
+		if(s.startsWith("those ")) return s.substring(6);
+		return s;
+	}
+
+	public static boolean wordOverlap(Set<String> t1, Set<String> t2) {
+		for (String s : t2){
+			if (t1.contains(s)){
+				return true;
+			}
+		}
+		return false;
+	}
+
+	public static boolean wordSubstring(Set<String> t1, Set<String> t2){
+	  for(String s1 : t1){
+	    for(String s2 : t2){
+	      if(s1.contains(s2) || s2.contains(s1)) return true;
+	    }
+	  }
+	  return false;
+	}
+	
+	public static Set<String> contentWords(Annotation a1){
+		Set<String> words = new HashSet<>();
+		for(BaseToken tok : JCasUtil.selectCovered(BaseToken.class, a1)){
+			words.add(tok.getCoveredText().toLowerCase());
+		}
+		return words;
+	}
+	
+	public static boolean isPronoun(IdentifiedAnnotation a1){
+	  List<BaseToken> tokens = JCasUtil.selectCovered(BaseToken.class, a1);
+	  
+	  if(tokens.size() != 1){
+	    return false;
+	  }
+	  
+	  BaseToken token = tokens.get(0);
+	  if(token.getPartOfSpeech() == null){
+	    return false;
+	  }
+	  if(token.getPartOfSpeech().startsWith("PRP")) return true;
+	  if(token.getPartOfSpeech().equals("DT")) return true;
+
+	  return false;
+	}
+	
+	public static boolean inQuote(JCas jcas, Annotation a){
+	  boolean inQuote = false;
+	  String docText = jcas.getDocumentText();
+	  
+	  // Logic: find the newline preceding this mention; if there is a quote between the
+	  // start of that line and the start of the mention, the mention is inside quotes.
+	  // Not foolproof, but probably accurate enough.
+	  int lastNewline = docText.lastIndexOf("\n", a.getBegin());
+	  if(lastNewline < 0){
+	    lastNewline = 0; // mention is on the first line -- search from the start of the document
+	  }
+	  int firstQuote = docText.indexOf('"', lastNewline);
+	  if(firstQuote >= 0 && firstQuote < a.getBegin()){
+	    inQuote = true;
+	  }
+	  
+	  return inQuote;
+	}
+}
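
A few hand-traced examples of the string helpers above, shown as illustrative calls with their expected results:

    StringMatchingFeatureExtractor.startMatch("left pleural effusion", "left arm");  // true  -- first tokens match
    StringMatchingFeatureExtractor.endMatch("pleural effusion", "small effusion");   // true  -- last tokens match
    StringMatchingFeatureExtractor.soonMatch("The mass", "a mass");                  // true  -- both reduce to "mass"
    StringMatchingFeatureExtractor.nonDetSubstr("the mass");                         // returns "mass"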

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,54 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+public class TemporalFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation> {
+
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+      IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    
+    String a1dtr = getDocTimeRelForArg(jCas, arg1);
+    String a2dtr = getDocTimeRelForArg(jCas, arg2);
+
+    feats.add(new Feature("Arg1DTR_" + a1dtr, true));
+    feats.add(new Feature("Arg2DTR_" + a2dtr, true));
+    
+    if(a1dtr.equals(a2dtr)){
+      if(!a1dtr.equals("NA")){
+        feats.add(new Feature("DTR_Match", true));
+      }
+    }
+    
+    return feats;
+  }
+
+  private static String getDocTimeRelForArg(JCas jCas, IdentifiedAnnotation arg){
+    String dtr = "NA";
+    
+    // find EventMentions and grab their event properties
+    ConllDependencyNode node = DependencyUtility.getNominalHeadNode(jCas, arg);
+    if(node != null){
+      List<EventMention> events = JCasUtil.selectCovered(jCas, EventMention.class, node);
+      for(EventMention event : events){
+        // only consider exact EventMention instances, not subtypes such as disease/disorder mentions
+        if(event.getClass().getSimpleName().equals("EventMention")){
+          if(event.getEvent() != null && event.getEvent().getProperties() != null && event.getEvent().getProperties().getDocTimeRel() != null){
+            dtr = event.getEvent().getProperties().getDocTimeRel();
+          }
+        }
+      }
+    }
+    return dtr;
+  }
+}
\ No newline at end of file