You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2015/11/09 15:48:12 UTC
svn commit: r1713449 - in
/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference:
ae/ ae/features/ ae/features/cluster/ eval/
Author: tmill
Date: Mon Nov 9 14:48:12 2015
New Revision: 1713449
URL: http://svn.apache.org/viewvc?rev=1713449&view=rev
Log:
Variety of minor changes ; synching with remotes.
Added:
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java
Removed:
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfMarkableSpans.java
Modified:
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java
Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java Mon Nov 9 14:48:12 2015
@@ -8,6 +8,7 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.List;
+import org.apache.ctakes.constituency.parser.util.TreeUtils;
import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
@@ -124,18 +125,21 @@ public class CoreferenceChainScoringOutp
Multiset<Integer> endSet = HashMultiset.create();
int tokenId = 0;
int sentId = 0;
+ BaseToken nextToken = tokens.get(0);
for(int i = 0; i < tokens.size(); i++){
- BaseToken token = tokens.get(i);
+ boolean endSentToken = false;
+ BaseToken token = nextToken;
+ if(i+1 < tokens.size()){
+ nextToken = tokens.get(i+1);
+ if(nextToken instanceof NewlineToken || (token.getCoveredText().equals(".") && !(endSet.size() > 0))){
+ endSentToken = true;
+ }
+ }
// if we see a newline token at the end of a sentence break the sentence
// only print out if we are not at the start of the sentence:
if(token instanceof NewlineToken){
- if(tokenId > 0){
- out.println();
- tokenId = 0;
- sentId++;
- }
continue;
}
@@ -172,6 +176,7 @@ public class CoreferenceChainScoringOutp
}
}
}
+
out.print(filename.getPath());
out.print('\t');
@@ -179,12 +184,23 @@ public class CoreferenceChainScoringOutp
out.print('\t');
out.print(tokenId++);
out.print('\t');
- out.print(token instanceof NewlineToken ? "Newline" : token.getCoveredText());
+ out.print(token instanceof NewlineToken ? "Newline" : TreeUtils.escapePunct(token.getCoveredText()));
out.print('\t');
out.print(token.getPartOfSpeech());
out.print('\t');
- // parse bit -- can ignore?
- out.print('-'); out.print('\t');
+ // parse bit -- assume flat parse
+ if(tokenId == 1){
+ out.print("(NOPARSE*");
+ // special case for one word sentences:
+ if(endSentToken){
+ out.print(")");
+ }
+ }else if(endSentToken){
+ out.print("*)");
+ }else{
+ out.print("*");
+ }
+ out.print('\t');
// predicate lemma -- can ignore?
out.print('-'); out.print('\t');
// predicate frameset id -- can ignore?
@@ -206,6 +222,7 @@ public class CoreferenceChainScoringOutp
buff.append(')');
buff.append('|');
}
+ endSet.remove(ind);
// endMention.remove(ind);
}
for(int ind : wholeMention){
@@ -241,9 +258,12 @@ public class CoreferenceChainScoringOutp
}else{
out.println("_");
}
-// }
-// out.println();
-// lastToken = token;
+
+ if(endSentToken){
+ out.println();
+ tokenId = 0;
+ sentId++;
+ }
}
if(!isGold){
icOut.println("#end document");
Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java Mon Nov 9 14:48:12 2015
@@ -60,7 +60,11 @@ public class DeterministicMarkableAnnota
for(Segment seg : JCasUtil.select(jCas, Segment.class)){
for(ConllDependencyNode node : JCasUtil.selectCovered(jCas, ConllDependencyNode.class, seg)){
String nodeText = node.getCoveredText().toLowerCase();
- TerminalTreebankNode term = JCasUtil.selectCovered(TerminalTreebankNode.class, node).get(0);
+ List<TerminalTreebankNode> terms = JCasUtil.selectCovered(TerminalTreebankNode.class, node);
+ TerminalTreebankNode term = null;
+ if(terms.size() > 0){
+ term = terms.get(0);
+ }
if(node.getId() == 0){
continue;
@@ -71,7 +75,7 @@ public class DeterministicMarkableAnnota
// 1) get nouns, and expand the markable to the phrase they cover
// 2) get determiners like "this" and "these"
// 3) non-passive "it"
- if((node.getPostag().equals("NN") || node.getPostag().equals("NNS")) && term.getNodeType().startsWith("N")){
+ if(node.getPostag().startsWith("NN") && term != null && term.getNodeType().startsWith("N")){
if(node.getForm().matches("\\s+")) continue;
// TODO fix this godawful hack:
if(nodeText.equals("date") || nodeText.equals("tablet") || nodeText.equals("hg") || nodeText.equals("lb") || nodeText.equals("status")
@@ -83,7 +87,7 @@ public class DeterministicMarkableAnnota
int end = node.getEnd();
// if(node.getHead().getId() != 0){
List<ConllDependencyNode> progeny = getProgeny(node, getDependencyNodes(jCas, getSentence(jCas, node)));
- progeny = removeConjunctionNodes(node, progeny);
+ progeny = removeUnannotatedNodes(node, progeny);
if(progeny.size() > 0){
for(ConllDependencyNode child : progeny){
if(child.getBegin() < begin){
@@ -109,6 +113,12 @@ public class DeterministicMarkableAnnota
end = prevToken.getEnd();
}
}
+
+ Matcher m = headerPatt.matcher(nodeText);
+ if(m.find()){
+ begin = begin + m.end();
+ }
+
Markable markable = new Markable(jCas, begin, end);
markable.addToIndexes();
}else if(node.getPostag().equals("DT") && !node.getDeprel().equals("det")){
@@ -123,7 +133,9 @@ public class DeterministicMarkableAnnota
}
}
- private static List<ConllDependencyNode> removeConjunctionNodes(ConllDependencyNode originalNode,
+ // Post-process to remove those kinds of nodes which may or may not be correctly parsed but do not tend to align with gold annotated
+ // markables (and usually our intuitions as well, so it's not completely hacky).
+ private static List<ConllDependencyNode> removeUnannotatedNodes(ConllDependencyNode originalNode,
List<ConllDependencyNode> progeny) {
List<ConllDependencyNode> filtered = new ArrayList<>();
@@ -133,7 +145,8 @@ public class DeterministicMarkableAnnota
boolean blockedByConj = false;
for(ConllDependencyNode pathEl : DependencyUtility.getPath(progeny, node, originalNode)){
if(pathEl == originalNode) continue;
- if(pathEl.getDeprel().equals("conj") || pathEl.getDeprel().equals("cc") || pathEl.getPostag().equals(".") || pathEl.getPostag().equals(",") || pathEl.getDeprel().equals("meta")){
+ if(pathEl.getDeprel().equals("conj") || pathEl.getDeprel().equals("cc") || pathEl.getPostag().equals(".") || pathEl.getPostag().equals(",") || pathEl.getDeprel().equals("meta")
+ || pathEl.getCoveredText().matches("(([A-Z][\\.\\:\\)])|(#\\d+)|(\\d+[\\.\\:\\)]))")){
blockedByConj = true;
break;
}
Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java Mon Nov 9 14:48:12 2015
@@ -477,10 +477,10 @@ public class EventCoreferenceAnnotator e
String category = super.classifier.classify(features);
- if(this.scoreAll && category.equals(NO_RELATION_CATEGORY)){
+ if(this.scoreAll){
Map<String,Double> scores = super.classifier.score(features);
- category = IDENTITY_RELATION;
this.lastScore = scores.get(IDENTITY_RELATION);
+ category = IDENTITY_RELATION;
}
return category;
}
Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Mon Nov 9 14:48:12 2015
@@ -16,6 +16,8 @@ import org.apache.ctakes.core.util.ListF
import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor;
import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistSemExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistanceFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterMentionFeaturesExtractor;
import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
@@ -24,8 +26,6 @@ import org.apache.ctakes.coreference.ae.
import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
import org.apache.ctakes.coreference.util.ClusterUtils;
import org.apache.ctakes.dependency.parser.util.DependencyUtility;
-import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
-import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator.IdentifiedAnnotationPair;
import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
@@ -55,6 +55,7 @@ import org.cleartk.ml.CleartkProcessingE
import org.cleartk.ml.DataWriter;
import org.cleartk.ml.Feature;
import org.cleartk.ml.Instance;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
import org.cleartk.ml.jar.DefaultDataWriterFactory;
import org.cleartk.ml.jar.DirectoryDataWriterFactory;
import org.cleartk.ml.jar.GenericJarClassifierFactory;
@@ -82,7 +83,7 @@ public class MentionClusterCoreferenceAn
MentionClusterCoreferenceAnnotator.class,
CleartkAnnotator.PARAM_IS_TRAINING,
true,
- RelationExtractorAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+ MentionClusterCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
downsamplingRate,
DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
dataWriterClass,
@@ -100,8 +101,9 @@ public class MentionClusterCoreferenceAn
modelPath);
}
- private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> featureExtractors = this.getFeatureExtractors();
-
+ private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> relationExtractors = this.getFeatureExtractors();
+ private List<FeatureExtractor1<Markable>> mentionExtractors = this.getMentionExtractors();
+
private Set<String> markableStrings = null;
private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
private Map<String,Set<Markable>> headWordMarkables = null;
@@ -116,6 +118,7 @@ public class MentionClusterCoreferenceAn
extractors.add(new MentionClusterDepHeadExtractor());
extractors.add(new MentionClusterStackFeaturesExtractor());
extractors.add(new MentionClusterSalienceFeaturesExtractor());
+// extractors.add(new MentionClusterDistanceFeaturesExtractor());
try {
extractors.add(new MentionClusterDistSemExtractor());
@@ -127,22 +130,29 @@ public class MentionClusterCoreferenceAn
return extractors;
}
+ protected List<FeatureExtractor1<Markable>> getMentionExtractors(){ // Builds the extractors that look at the candidate mention alone (FeatureExtractor1<Markable>), as opposed to the cluster/mention relation extractors.
+ List<FeatureExtractor1<Markable>> extractors = new ArrayList<>();
+ // mention features from pairwise system:
+ extractors.add(new MentionClusterMentionFeaturesExtractor());
+
+ return extractors; // freshly allocated list holding the single extractor added above
+ }
+
protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
JCas jcas,
IdentifiedAnnotation mention){
int sentDist = 5;
// using linked hash set ensures no duplicates:
LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();
- if(mention.getCoveredText().equalsIgnoreCase("this")){
- pairs.addAll(getSentenceDistancePairs(jcas, mention, 1));
- pairs.addAll(getClusterPairs(jcas, mention, 3));
- }else{
+// if(mention.getCoveredText().equalsIgnoreCase("this")){
+// pairs.addAll(getSentenceDistancePairs(jcas, mention, 1));
+// pairs.addAll(getClusterPairs(jcas, mention, 3));
+// }else{
pairs.addAll(getSentenceDistancePairs(jcas, mention, sentDist));
pairs.addAll(getSectionHeaderPairs(jcas, mention, sentDist));
pairs.addAll(getClusterPairs(jcas, mention, Integer.MAX_VALUE));
-// pairs.addAll(getExactStringMatchPairs(jcas, mention, sentDist));
pairs.addAll(getHeadwordMatchPairs(jcas, mention, sentDist));
- }
+// }
return pairs;
}
@@ -189,6 +199,12 @@ public class MentionClusterCoreferenceAn
JCas jcas, IdentifiedAnnotation mention, int sentDist) {
List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+ NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+ Annotation first = (Annotation) members.getHead();
+ if(first == null || mention.getBegin() <= first.getEnd()){
+ continue;
+ }
+
IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){
continue;
@@ -298,6 +314,10 @@ public class MentionClusterCoreferenceAn
List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
+ if(headNode == null){
+ Logger.getLogger(MentionClusterCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it.");
+ return pairs;
+ }
String head = headNode.getCoveredText().toLowerCase();
if(headWordMarkables.containsKey(head)){
Set<Markable> headSet = headWordMarkables.get(head);
@@ -324,7 +344,7 @@ public class MentionClusterCoreferenceAn
markableStrings = new HashSet<>();
nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
headWordMarkables = new HashMap<>();
- pairScores = getMarkablePairScores(jCas);
+// pairScores = getMarkablePairScores(jCas);
Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
relationLookup = new HashMap<>();
@@ -352,6 +372,7 @@ public class MentionClusterCoreferenceAn
for(Segment segment : JCasUtil.select(jCas, Segment.class)){
for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
+ ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
String mentionText = mention.getCoveredText().toLowerCase();
boolean singleton = true;
double maxScore = 0.0;
@@ -361,7 +382,7 @@ public class MentionClusterCoreferenceAn
CollectionTextRelation cluster = pair.getCluster();
// apply all the feature extractors to extract the list of features
List<Feature> features = new ArrayList<>();
- for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.featureExtractors) {
+ for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.relationExtractors) {
List<Feature> feats = extractor.extract(jCas, cluster, mention);
if (feats != null){
// Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName()));
@@ -369,6 +390,10 @@ public class MentionClusterCoreferenceAn
}
}
+ for(FeatureExtractor1<Markable> extractor : this.mentionExtractors){
+ features.addAll(extractor.extract(jCas, mention));
+ }
+
// here is where feature conjunctions can go (dupFeatures)
List<Feature> dupFeatures = new ArrayList<>();
// sanity check on feature values
@@ -378,15 +403,43 @@ public class MentionClusterCoreferenceAn
String message = String.format("Null value found in %s from %s", feature, features);
System.err.println(message);
// throw new IllegalArgumentException(String.format(message, feature, features));
-// }else{
-// if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
-// dupFeatures.add(new Feature("PRO+"+feature.getName(), feature.getValue()));
-// }
+ }else{
+ String prefix = null;
+ // Durrett and Klein style feature conjunctions: pronoun type or pos tag. maybe try umls semantic-type?
+ /*
+ if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
+ prefix = "PRO_"+mentionText;
+ }else if(headNode != null && headNode.getPostag() != null){
+ prefix = headNode.getPostag();
+ }else{
+ prefix = "UNK";
+ }
+ */
+ // headword-based feature conjunctions
+/* if(headNode != null && headNode.getCoveredText() != null && headMatches(headNode.getCoveredText().toLowerCase(), features)){
+ prefix = "HEAD_MATCH";
+ }else{
+ prefix = "NO_HEAD_MATCH";
+ }
+*/
+
+ // UMLS semantic type feature conjunctions
+ for(Feature feat : features){
+ if(feat.getName().startsWith("ClusterSemType")){
+ dupFeatures.add(new Feature(feat.getName()+"_"+feature.getName(), feature.getValue()));
+ }
+ }
+
+ if(prefix != null){
+ dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue()));
+ }
}
}
features.addAll(dupFeatures);
+
// pairwise score features:
+ /*
double minPairScore = 1.0;
double maxPairScore = 0.0;
double avePairScore = 0.0;
@@ -400,6 +453,9 @@ public class MentionClusterCoreferenceAn
markablePair = new HashableArguments(mention, member);
score = pairScores.get(markablePair);
}
+ if(score == null){
+ score = 0.0;
+ }
if(score != null){
avePairScore += score;
if(score > maxPairScore){
@@ -410,17 +466,19 @@ public class MentionClusterCoreferenceAn
}
}
}
+
features.add(new Feature("PAIRWISE_MAX", maxPairScore));
- features.add(new Feature("PAIRWISE_MIN", minPairScore));
- if(numPairs > 0){
- avePairScore /= numPairs;
- }else{
- avePairScore = 0.0;
- }
- if(Double.isNaN(avePairScore)){
- Logger.getLogger(MentionClusterCoreferenceAnnotator.class).error("Pairwise average feature found with value NaN");
- }
- features.add(new Feature("PAIRWISE_AVE", avePairScore));
+ */
+// features.add(new Feature("PAIRWISE_MIN", minPairScore));
+// if(numPairs > 0){
+// avePairScore /= numPairs;
+// }else{
+// avePairScore = 0.0;
+// }
+// if(Double.isNaN(avePairScore)){
+// Logger.getLogger(MentionClusterCoreferenceAnnotator.class).error("Pairwise average feature found with value NaN");
+// }
+// features.add(new Feature("PAIRWISE_AVE", avePairScore));
// during training, feed the features to the data writer
if (this.isTraining()) {
@@ -468,7 +526,6 @@ public class MentionClusterCoreferenceAn
markableStrings.add(mention.getCoveredText().toLowerCase());
- ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
if(headNode != null){
String head = headNode.getCoveredText().toLowerCase();
if(!headWordMarkables.containsKey(head)){
@@ -491,6 +548,21 @@ public class MentionClusterCoreferenceAn
}
}
}
+
+ removeSingletonClusters(jCas);
+ }
+
+ private boolean headMatches(String head, List<Feature> feats){ // Returns true iff the first feature named "ClusterHeadMatchesMentionHead" carries the value Boolean true. NOTE(review): the head parameter is never read in this body.
+ boolean match = false;
+ for(Feature feat : feats){
+ if(feat.getName().equals("ClusterHeadMatchesMentionHead")){
+ if(feat.getValue().equals(true)){ // getValue() is dereferenced unguarded; a null feature value would throw here
+ match = true;
+ }
+ break; // only the first feature with that name is consulted, whatever its value
+ }
+ }
+ return match; // false when no feature with that name exists
}
/**
@@ -562,6 +634,20 @@ public class MentionClusterCoreferenceAn
}
+ private void removeSingletonClusters(JCas jcas){ // Removes from the indexes every CollectionTextRelation whose member list has exactly one element.
+ List<CollectionTextRelation> toRemove = new ArrayList<>(); // collected first, removed in a second pass below; presumably to avoid changing the index while iterating over it
+ for(CollectionTextRelation rel : JCasUtil.select(jcas, CollectionTextRelation.class)){
+ NonEmptyFSList head = (NonEmptyFSList) rel.getMembers(); // cast assumes the member list is never empty -- TODO confirm against the code that creates clusters
+ if(head.getTail() instanceof EmptyFSList){ // an empty tail means the list node holding the first member is also the last
+ toRemove.add(rel);
+ }
+ }
+
+ for(CollectionTextRelation rel : toRemove){
+ rel.removeFromIndexes();
+ }
+ }
+
private static final boolean dominates(Annotation arg1, Annotation arg2) {
return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
}
Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java?rev=1713449&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java Mon Nov 9 14:48:12 2015
@@ -0,0 +1,655 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistSemExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterMentionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.CleartkProcessingException;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+import org.cleartk.ml.svmlight.rank.QidInstance;
+import org.cleartk.util.ViewUriUtil;
+
+public class MentionClusterRankingCoreferenceAnnotator extends CleartkAnnotator<Double> {
+ public static final String NO_RELATION_CATEGORY = "-NONE-";
+ public static final String CLUSTER_RELATION_CATEGORY = "CoreferenceClusterMember";
+
+ public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE =
+ "ProbabilityOfKeepingANegativeExample";
+ @ConfigurationParameter(
+ name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+ mandatory = false,
+ description = "probability that a negative example should be retained for training")
+ protected double probabilityOfKeepingANegativeExample = 0.5;
+
+ protected Random coin = new Random(0);
+
+ boolean greedyFirst = true;
+
+ private int qid = 0;
+
+ public static AnalysisEngineDescription createDataWriterDescription( // Factory for a training-mode description of this annotator; configuration is passed as alternating parameter-name / value arguments.
+ Class<? extends DataWriter<?>> dataWriterClass,
+ File outputDirectory,
+ float downsamplingRate) throws ResourceInitializationException {
+ return AnalysisEngineFactory.createEngineDescription(
+ MentionClusterRankingCoreferenceAnnotator.class,
+ CleartkAnnotator.PARAM_IS_TRAINING,
+ true,
+ MentionClusterRankingCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+ downsamplingRate, // NOTE(review): a float is supplied for a parameter whose backing field is declared double -- confirm the framework converts it as intended
+ DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+ dataWriterClass,
+ DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+ outputDirectory);
+ }
+
+ public static AnalysisEngineDescription createAnnotatorDescription( // Factory for a non-training (classification) description of this annotator.
+ String modelPath) throws ResourceInitializationException {
+ return AnalysisEngineFactory.createEngineDescription(
+ MentionClusterRankingCoreferenceAnnotator.class,
+ CleartkAnnotator.PARAM_IS_TRAINING,
+ false,
+ GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+ modelPath); // modelPath is handed through unchanged as the classifier jar path; the negative-example probability is not set here, so the field default applies
+ }
+
+ private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> relationExtractors = this.getFeatureExtractors();
+ private List<FeatureExtractor1<Markable>> mentionExtractors = this.getMentionExtractors();
+
+ private Set<String> markableStrings = null;
+ private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
+ private Map<String,Set<Markable>> headWordMarkables = null;
+ private Map<HashableArguments,Double> pairScores = null;
+
+ protected List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> getFeatureExtractors() { // Builds the extractors that compute features over a (cluster, mention) pair.
+ List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> extractors = new ArrayList<>();
+ extractors.add(new MentionClusterAgreementFeaturesExtractor());
+ extractors.add(new MentionClusterStringFeaturesExtractor());
+ extractors.add(new MentionClusterSectionFeaturesExtractor());
+ extractors.add(new MentionClusterUMLSFeatureExtractor());
+ extractors.add(new MentionClusterDepHeadExtractor());
+ extractors.add(new MentionClusterStackFeaturesExtractor());
+ extractors.add(new MentionClusterSalienceFeaturesExtractor());
+// extractors.add(new MentionClusterDistanceFeaturesExtractor());
+
+ try { // these two are constructed inside the try because construction can raise IOException
+ extractors.add(new MentionClusterDistSemExtractor());
+ extractors.add(new MentionClusterSemTypeDepPrefsFeatureExtractor());
+ } catch (IOException e) {
+ e.printStackTrace(); // NOTE(review): the failure is only printed; the list is still returned, missing whichever extractor(s) had not been added yet
+ }
+
+ return extractors;
+ }
+
+ protected List<FeatureExtractor1<Markable>> getMentionExtractors(){ // Builds the extractors that look at the candidate mention alone (FeatureExtractor1<Markable>), as opposed to the cluster/mention relation extractors.
+ List<FeatureExtractor1<Markable>> extractors = new ArrayList<>();
+ // mention features from pairwise system:
+ extractors.add(new MentionClusterMentionFeaturesExtractor());
+
+ return extractors; // freshly allocated list holding the single extractor added above
+ }
+
+ protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs( // Collects the candidate (cluster, mention) pairs for one mention by taking the union of several candidate generators.
+ JCas jcas,
+ IdentifiedAnnotation mention){
+ int sentDist = 5; // sentence-distance limit handed to every generator below except getClusterPairs
+ // using linked hash set ensures no duplicates:
+ LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>(); // de-duplication depends on the pair class's equals/hashCode, which are not visible here
+// if(mention.getCoveredText().equalsIgnoreCase("this")){
+// pairs.addAll(getSentenceDistancePairs(jcas, mention, 1));
+// pairs.addAll(getClusterPairs(jcas, mention, 3));
+// }else{
+ pairs.addAll(getSentenceDistancePairs(jcas, mention, sentDist));
+ pairs.addAll(getSectionHeaderPairs(jcas, mention, sentDist));
+ pairs.addAll(getClusterPairs(jcas, mention, Integer.MAX_VALUE)); // Integer.MAX_VALUE: effectively no sentence-distance limit for this generator
+ pairs.addAll(getHeadwordMatchPairs(jcas, mention, sentDist));
+// }
+
+ return pairs; // iteration order is first-insertion order across the four generators
+ }
+
+ /*
+ * getExactStringMatchPairs()
+ * For mentions that have the exact string repeated elsewhere in the document we want to
+ * allow matching across any distance. We don't use the sentence distance parameter here.
+ * We make use of a global variable markableStrings that is a HashSet containing all the markable
+ * strings from this document.
+ *
+ * Returns one (cluster, mention) pair for each cluster that has a member, listed before the
+ * cluster's most recent member relative to the mention, whose lower-cased covered text equals
+ * the mention's lower-cased covered text. The list is empty when markableStrings does not
+ * contain the mention's lower-cased text.
+ *
+ * NOTE(review): the loop breaks on reaching the most recent member before comparing it, so
+ * that member's own text is never tested -- confirm this is intended.
+ * NOTE(review): the candidate generator shown above does not call this method.
+ */
+ private List<CollectionTextRelationIdentifiedAnnotationPair> getExactStringMatchPairs(
+ JCas jcas, IdentifiedAnnotation mention, int sentDist) {
+ List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+
+ if(markableStrings.contains(mention.getCoveredText().toLowerCase())){
+ for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+ Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+ if(mostRecent == null) continue;
+
+ for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+ if(m == mostRecent) break;
+ // see if any of the members of the cluster have the exact same string as this
+ if(m.getCoveredText().toLowerCase().equals(mention.getCoveredText().toLowerCase())){
+ pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+ break;
+ }
+ }
+ }
+ }
+ return pairs;
+ }
+
+ /**
+  * getClusterPairs()
+  * Pairs the mention with clusters that already have more than one member
+  * in front of it. A cluster is considered only if its first member ends
+  * before the mention begins and its most recent member is no more than
+  * sentDist sentences away. Because a training-time cluster can contain members
+  * that come after the focus mention, the members are counted only up to and
+  * including the most recent one; the cluster is paired when that count
+  * exceeds one.
+  *
+  * @param jcas the CAS being processed
+  * @param mention the focus mention
+  * @param sentDist maximum sentence distance to the cluster's most recent member
+  * @return one pair per qualifying cluster
+  */
+ private List<CollectionTextRelationIdentifiedAnnotationPair> getClusterPairs(
+     JCas jcas, IdentifiedAnnotation mention, int sentDist) {
+   List<CollectionTextRelationIdentifiedAnnotationPair> candidates = new ArrayList<>();
+
+   for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+     NonEmptyFSList members = (NonEmptyFSList) cluster.getMembers();
+     Annotation head = (Annotation) members.getHead();
+     boolean startsBeforeMention = head != null && mention.getBegin() > head.getEnd();
+     if(!startsBeforeMention){
+       continue;
+     }
+
+     IdentifiedAnnotation latest = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+     if(latest == null){
+       continue;
+     }
+     if(EventCoreferenceAnnotator.sentDist(jcas, latest, mention) > sentDist){
+       continue;
+     }
+
+     // count members up to and including the most recent one
+     int seen = 0;
+     for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+       seen++;
+       if(member == latest){
+         break;
+       }
+     }
+     if(seen >= 2){
+       candidates.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+     }
+   }
+
+   return candidates;
+ }
+
+ /**
+  * Pairs the mention with nearby clusters. Only clusters whose first member
+  * ends before the mention begins are considered. Unless one of the mention's
+  * best entity types is an anatomical site or a medication, the cluster's most
+  * recent member must be within sentDist sentences of the mention. Finally,
+  * when both the mention and the cluster have entity types, they must share at
+  * least one of them.
+  *
+  * @param jcas the CAS being processed
+  * @param mention the focus mention; it is cast to Markable
+  * @param sentDist maximum sentence distance to the cluster's most recent member
+  * @return one pair per qualifying cluster
+  */
+ protected List<CollectionTextRelationIdentifiedAnnotationPair> getSentenceDistancePairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+   List<CollectionTextRelationIdentifiedAnnotationPair> candidates = new ArrayList<>();
+   Set<String> mentionTypes = getBestEnt(jcas, (Markable) mention);
+   // anatomical sites and medications are exempt from the distance limit
+   boolean distanceExempt =
+       mentionTypes.contains(AnatomicalSiteMention.class.getSimpleName())
+       || mentionTypes.contains(MedicationEventMention.class.getSimpleName());
+
+   for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+     NonEmptyFSList members = (NonEmptyFSList) cluster.getMembers();
+     Annotation head = (Annotation) members.getHead();
+     if(head == null || mention.getBegin() <= head.getEnd()){
+       continue;
+     }
+
+     if(!distanceExempt){
+       IdentifiedAnnotation latest = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+       if(latest == null || EventCoreferenceAnnotator.sentDist(jcas, latest, mention) > sentDist){
+         continue;
+       }
+     }
+
+     // when both sides are typed, require at least one type in common
+     Set<String> clusterTypes = getBestEnt(jcas, cluster);
+     if(!mentionTypes.isEmpty() && !clusterTypes.isEmpty()){
+       boolean sharedType = false;
+       for(String type : mentionTypes){
+         if(clusterTypes.contains(type)){
+           sharedType = true;
+           break;
+         }
+       }
+       if(!sharedType){
+         continue;
+       }
+     }
+     candidates.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+   }
+   return candidates;
+ }
+
+ /**
+  * getSectionHeaderPairs()
+  * Pairs the mention with clusters that have a member sitting on a line by
+  * itself (a section header, or equally a list item). Headers are found through
+  * the Paragraph annotations, roughly the areas between newlines: a paragraph
+  * covering exactly one sentence is treated as a header. Clusters whose most
+  * recent member is within sentDist sentences are skipped here because the
+  * sentence distance generator already supplies them.
+  *
+  * @param jcas the CAS being processed
+  * @param mention the focus mention
+  * @param sentDist sentence distance at or below which a cluster is left to the
+  *          sentence distance generator
+  * @return one pair per cluster with a member inside a header paragraph
+  */
+ protected List<CollectionTextRelationIdentifiedAnnotationPair> getSectionHeaderPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+   List<CollectionTextRelationIdentifiedAnnotationPair> candidates = new ArrayList<>();
+
+   clusters:
+   for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+     NonEmptyFSList members = (NonEmptyFSList) cluster.getMembers();
+     Annotation head = (Annotation) members.getHead();
+     if(head == null || mention.getBegin() <= head.getEnd()){
+       continue;
+     }
+
+     // clusters inside the sentence distance window are covered by the other generator
+     IdentifiedAnnotation latest = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+     if(latest == null || EventCoreferenceAnnotator.sentDist(jcas, latest, mention) <= sentDist){
+       continue;
+     }
+
+     // look at every paragraph lying between the document start and the mention
+     List<Paragraph> precedingParagraphs = JCasUtil.selectCovered(jcas, Paragraph.class, 0, mention.getBegin());
+     for(Paragraph paragraph : precedingParagraphs){
+       List<Sentence> sentences = JCasUtil.selectCovered(jcas, Sentence.class, paragraph);
+       if(sentences == null || sentences.size() != 1){
+         // not a single-sentence paragraph, so not modeled as a header
+         continue;
+       }
+       for(Markable member : JCasUtil.select(members, Markable.class)){
+         if(dominates(paragraph, member)){
+           candidates.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+           // one hit is enough; move on to the next cluster
+           continue clusters;
+         }
+       }
+     }
+   }
+   return candidates;
+ }
+
+ /**
+  * Pairs the mention with every cluster that has a member, at or before the
+  * cluster's most recent member, whose head word equals the mention's
+  * head word. The comparison uses headWordMarkables, the per-document map from
+  * lower-cased head word to the markables recorded with that head.
+  *
+  * @param jcas the CAS being processed
+  * @param mention the focus mention
+  * @param sentDist not used by this generator
+  * @return one pair per cluster containing a head word match; empty when the
+  *         mention has no nominal head node
+  */
+ protected List<CollectionTextRelationIdentifiedAnnotationPair> getHeadwordMatchPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+   List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+
+   ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
+   if(headNode == null){
+     Logger.getLogger(MentionClusterRankingCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it.");
+     return pairs;
+   }
+   String head = headNode.getCoveredText().toLowerCase();
+   if(headWordMarkables.containsKey(head)){
+     Set<Markable> headSet = headWordMarkables.get(head);
+     for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+       Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+       if(mostRecent == null) continue;
+       for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+         // Test the member being visited. The previous code tested mostRecent on
+         // every iteration, so the loop never looked at any earlier member.
+         if(headSet.contains(m)){
+           pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+           break;
+         }
+         // members after the most recent one are not considered
+         if(m == mostRecent) break;
+       }
+     }
+   }
+
+   return pairs;
+ }
+
+ /**
+  * Processes one document. The per-document caches (markableStrings, nodeEntMap,
+  * headWordMarkables) are rebuilt first. In training mode a relation is created
+  * and indexed for every (cluster, member) combination already in the CAS and
+  * stored in a lookup table. Then, for every Markable in every Segment, features
+  * are extracted for each candidate cluster; in training mode they are written
+  * out as a QidInstance, otherwise they are scored and the best-scoring cluster
+  * is remembered. A mention with a best score above zero is attached to that
+  * cluster; any other mention starts a new one-member cluster. Clusters that
+  * still have a single member at the end are removed from the indexes.
+  *
+  * @param jCas the CAS holding the document
+  * @throws AnalysisEngineProcessException declared by the overridden method
+  */
+ @Override
+ public void process(JCas jCas) throws AnalysisEngineProcessException {
+ // lookup from pair of annotations to binary text relation
+ // note: assumes that there will be at most one relation per pair
+ markableStrings = new HashSet<>();
+ nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+ headWordMarkables = new HashMap<>();
+ // pairScores = getMarkablePairScores(jCas);
+
+ Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
+ relationLookup = new HashMap<>();
+ if (this.isTraining()) {
+ // training only: index one relation per (cluster, member) already in the CAS
+ for (CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)) {
+ for(IdentifiedAnnotation mention : JCasUtil.select(cluster.getMembers(), Markable.class)){
+ CollectionTextRelationIdentifiedAnnotationRelation relation =
+ new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+ relation.setCluster(cluster);
+ relation.setMention(mention);
+ relation.setCategory("CoreferenceClusterMember");
+ relation.addToIndexes();
+ // The key is a list of args so we can do bi-directional lookup
+ CollectionTextRelationIdentifiedAnnotationPair key = new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention);
+ // a duplicate key is only reported; the put below still replaces the earlier relation
+ if(relationLookup.containsKey(key)){
+ String cat = relationLookup.get(key).getCategory();
+ System.err.println("Error in: "+ ViewUriUtil.getURI(jCas).toString());
+ System.err.println("Error! This attempted relation " + relation.getCategory() + " already has a relation " + cat + " at this span: " + mention.getCoveredText());
+ }
+ relationLookup.put(key, relation);
+ }
+ }
+ }
+
+
+ for(Segment segment : JCasUtil.select(jCas, Segment.class)){
+ for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
+ ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
+ String mentionText = mention.getCoveredText().toLowerCase();
+ // NOTE(review): singleton is assigned here but never read in this method
+ boolean singleton = true;
+ // best score so far; starting at 0.0 means only predictions strictly above zero can win
+ double maxScore = 0.0;
+ CollectionTextRelation maxCluster = null;
+
+ for(CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs(jCas, mention)){
+ CollectionTextRelation cluster = pair.getCluster();
+ // apply all the feature extractors to extract the list of features
+ List<Feature> features = new ArrayList<>();
+ for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.relationExtractors) {
+ List<Feature> feats = extractor.extract(jCas, cluster, mention);
+ if (feats != null){
+ // Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName()));
+ features.addAll(feats);
+ }
+ }
+
+ // mention-only features do not depend on the candidate cluster
+ for(FeatureExtractor1<Markable> extractor : this.mentionExtractors){
+ features.addAll(extractor.extract(jCas, mention));
+ }
+
+ // here is where feature conjunctions can go (dupFeatures)
+ List<Feature> dupFeatures = new ArrayList<>();
+ // sanity check on feature values
+ for (Feature feature : features) {
+ if (feature.getValue() == null) {
+ // null values are replaced by the string "NULL" and reported, not rejected
+ feature.setValue("NULL");
+ String message = String.format("Null value found in %s from %s", feature, features);
+ System.err.println(message);
+ // throw new IllegalArgumentException(String.format(message, feature, features));
+ }else{
+ // every non-null feature is duplicated under a prefixed name: the pronoun
+ // itself for "it"/"this"/"that", else the head node's POS tag, else "UNK"
+ String prefix = "";
+ if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
+ prefix = "PRO_"+mentionText;
+ }else if(headNode != null && headNode.getPostag() != null){
+ prefix = headNode.getPostag();
+ }else{
+ prefix = "UNK";
+ }
+ dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue()));
+ }
+ }
+ features.addAll(dupFeatures);
+
+ // during training, feed the features to the data writer
+ // create a classification instance and write it to the training data
+
+ if (this.isTraining()) {
+ String category = this.getRelationCategory(relationLookup, cluster, mention);
+ // null means this negative example was sampled out
+ if (category == null) {
+ continue;
+ }
+ // outcome is 1.0 for a relation found in the lookup, 0.0 for NO_RELATION_CATEGORY
+ double outVal = 1.0;
+ if(category.equals(NO_RELATION_CATEGORY)){
+ outVal = 0.0;
+ }
+
+ // all candidates of one mention share the same qid
+ QidInstance<Double> inst = new QidInstance<>();
+ inst.setQid(String.valueOf(qid));
+ inst.addAll(features);
+ inst.setOutcome(outVal);
+ this.dataWriter.write(inst);
+ // stop after the first positive instance written for this mention
+ if(!category.equals(NO_RELATION_CATEGORY)){
+ break;
+ }
+ }
+
+ // during classification feed the features to the classifier and create
+ // annotations
+ else {
+ Double prediction = this.classify(features);
+ if(prediction > maxScore){
+ maxScore = prediction;
+ maxCluster = cluster;
+ }
+ }
+ }
+
+ // the caches are updated only after the candidates for this mention were generated
+ markableStrings.add(mention.getCoveredText().toLowerCase());
+
+ if(headNode != null){
+ String head = headNode.getCoveredText().toLowerCase();
+ if(!headWordMarkables.containsKey(head)){
+ headWordMarkables.put(head, new HashSet<Markable>());
+ }
+ headWordMarkables.get(head).add(mention);
+ }
+
+ // attach the mention to the best-scoring cluster if any candidate scored above zero
+ // NOTE(review): maxScore is only updated in the classification branch, so in
+ // training mode every mention takes the else branch below -- confirm intended
+ if(maxScore > 0){
+ createRelation(jCas, maxCluster, mention, CLUSTER_RELATION_CATEGORY);
+ }else{
+ // make the markable its own cluster:
+ CollectionTextRelation chain = new CollectionTextRelation(jCas);
+ NonEmptyFSList list = new NonEmptyFSList(jCas);
+ list.setHead(mention);
+ list.setTail(new EmptyFSList(jCas));
+ chain.setMembers(list);
+ chain.addToIndexes();
+ list.addToIndexes();
+ list.getTail().addToIndexes();
+ }
+ // one query id per mention
+ qid++;
+ }
+ }
+
+ // clusters that never gained a second member are dropped from the indexes
+ removeSingletonClusters(jCas);
+ }
+
+ /**
+  * Turns a (cluster, mention) combination into the label used for training.
+  * A combination found in the lookup table yields the category of its relation.
+  * A combination that is not found is a negative example, which is kept with
+  * probability <code>probabilityOfKeepingANegativeExample</code>.
+  *
+  * @param relationLookup table from (cluster, mention) pairs to known relations
+  * @param cluster the candidate cluster
+  * @param mention the focus mention
+  * @return the relation's category, <code>NO_RELATION_CATEGORY</code> for a kept
+  *         negative example, or <i>null</i> if the example should be skipped
+  */
+ protected String getRelationCategory(
+     Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup,
+     CollectionTextRelation cluster,
+     IdentifiedAnnotation mention) {
+   CollectionTextRelationIdentifiedAnnotationPair key =
+       new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention);
+   CollectionTextRelationIdentifiedAnnotationRelation known = relationLookup.get(key);
+   if (known != null) {
+     return known.getCategory();
+   }
+   // negative example: keep it only when the random draw falls within the keep probability
+   if (coin.nextDouble() <= this.probabilityOfKeepingANegativeExample) {
+     return NO_RELATION_CATEGORY;
+   }
+   return null;
+ }
+
+ /**
+  * Scores a list of features by handing it to this object's
+  * <code>classifier</code>. Subclasses may override this method to plug in a
+  * more elaborate classification procedure.
+  *
+  * @param features
+  *          The features to be classified.
+  * @return The value the classifier produced for the features.
+  * @throws CleartkProcessingException
+  *           propagated from the classifier
+  */
+ protected Double classify(List<Feature> features) throws CleartkProcessingException {
+   Double outcome = this.classifier.classify(features);
+   return outcome;
+ }
+
+ /**
+  * Records that a mention belongs to a cluster: a relation between the two is
+  * created and added to the CAS indexes, and the mention is appended to the
+  * cluster's member list.
+  *
+  * @param jCas
+  *          - JCas object, needed to create new UIMA types
+  * @param cluster
+  *          - Cluster the mention is attached to
+  * @param mention
+  *          - Mention being attached
+  * @param predictedCategory
+  *          - Category stored on the new relation
+  */
+ protected void createRelation(
+     JCas jCas,
+     CollectionTextRelation cluster,
+     IdentifiedAnnotation mention,
+     String predictedCategory) {
+   CollectionTextRelationIdentifiedAnnotationRelation membership =
+       new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+   membership.setCluster(cluster);
+   membership.setMention(mention);
+   membership.setCategory(predictedCategory);
+   membership.addToIndexes();
+
+   // grow the cluster itself so later mentions see this one as a member
+   ListFactory.append(jCas, cluster.getMembers(), mention);
+ }
+
+
+ /**
+  * Removes from the indexes every cluster whose member list has an empty tail,
+  * i.e. holds exactly one member.
+  *
+  * @param jcas the CAS to clean up
+  */
+ private void removeSingletonClusters(JCas jcas){
+   // collect first, remove afterwards, so the index is not modified while it is being read
+   List<CollectionTextRelation> singletons = new ArrayList<>();
+   for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+     NonEmptyFSList members = (NonEmptyFSList) cluster.getMembers();
+     boolean hasSingleMember = members.getTail() instanceof EmptyFSList;
+     if(hasSingleMember){
+       singletons.add(cluster);
+     }
+   }
+
+   for(int i = 0; i < singletons.size(); i++){
+     singletons.get(i).removeFromIndexes();
+   }
+ }
+
+ /**
+  * Tells whether the span of arg1 fully contains the span of arg2
+  * (identical boundaries count as contained).
+  */
+ private static final boolean dominates(Annotation arg1, Annotation arg2) {
+   if(arg1.getBegin() > arg2.getBegin()){
+     return false;
+   }
+   return arg1.getEnd() >= arg2.getEnd();
+ }
+
+ /**
+  * Collects the best entity type names of all Markable members of a cluster.
+  *
+  * @param jcas the CAS being processed
+  * @param cluster the cluster whose members are inspected
+  * @return the union of the members' best entity type names
+  */
+ public Set<String> getBestEnt(JCas jcas, CollectionTextRelation cluster){
+   Set<String> clusterTypes = new HashSet<>();
+   for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+     Set<String> memberTypes = getBestEnt(jcas, member);
+     clusterTypes.addAll(memberTypes);
+   }
+   return clusterTypes;
+ }
+
+ public Set<String> getBestEnt(JCas jcas, Markable markable){
+ Set<String> bestEnts = new HashSet<>();
+ IdentifiedAnnotation bestEnt = null;
+ Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+ ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+ Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+ for(IdentifiedAnnotation ent : coveringEnts){
+ if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+ ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent);
+ if(entHead == head){
+ if(bestEnt == null){
+ bestEnt = ent;
+ }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+ // if the span of this entity is bigger than the biggest existing one:
+ bestEnt = ent;
+ otherBestEnts = new HashSet<>();
+ }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+ // there is another one with the exact same span and possibly different type!
+ otherBestEnts.add(ent);
+ }
+ }
+ }
+
+ if(bestEnt!=null){
+ bestEnts.add(bestEnt.getClass().getSimpleName());
+ for(IdentifiedAnnotation other : otherBestEnts){
+ bestEnts.add(other.getClass().getSimpleName());
+ }
+ }
+ return bestEnts;
+ }
+
+
+ /**
+  * Maps the argument pair of every CoreferenceRelation in the CAS to that
+  * relation's confidence.
+  *
+  * @param jCas the CAS to read the relations from
+  * @return confidence per (arg1, arg2) pair
+  */
+ public Map<HashableArguments, Double> getMarkablePairScores(JCas jCas){
+   Map<HashableArguments, Double> confidenceByPair = new HashMap<>();
+   for(CoreferenceRelation relation : JCasUtil.select(jCas, CoreferenceRelation.class)){
+     IdentifiedAnnotation first = (IdentifiedAnnotation) relation.getArg1().getArgument();
+     IdentifiedAnnotation second = (IdentifiedAnnotation) relation.getArg2().getArgument();
+     confidenceByPair.put(new HashableArguments(first, second), relation.getConfidence());
+   }
+   return confidenceByPair;
+ }
+
+ /**
+  * Immutable (cluster, mention) pair used as a set element and map key.
+  * Two pairs are equal when they refer to the very same cluster object and
+  * the very same mention object (reference comparison).
+  */
+ public static class CollectionTextRelationIdentifiedAnnotationPair {
+   private final CollectionTextRelation cluster;
+   private final IdentifiedAnnotation mention;
+
+   public CollectionTextRelationIdentifiedAnnotationPair(CollectionTextRelation cluster, IdentifiedAnnotation mention){
+     this.cluster = cluster;
+     this.mention = mention;
+   }
+
+   /** @return the cluster half of the pair */
+   public final CollectionTextRelation getCluster(){
+     return this.cluster;
+   }
+
+   /** @return the mention half of the pair */
+   public final IdentifiedAnnotation getMention(){
+     return this.mention;
+   }
+
+   /**
+    * @return true only for another pair holding the same cluster and mention
+    *         references; false for null or an object of any other type
+    */
+   @Override
+   public boolean equals(Object obj) {
+     if(this == obj){
+       return true;
+     }
+     // Guard the cast: Object.equals must return false, not throw, for null
+     // and for objects of a different type.
+     if(!(obj instanceof CollectionTextRelationIdentifiedAnnotationPair)){
+       return false;
+     }
+     CollectionTextRelationIdentifiedAnnotationPair other = (CollectionTextRelationIdentifiedAnnotationPair) obj;
+     return (this.cluster == other.cluster &&
+         this.mention == other.mention);
+   }
+
+   /** Combines the hash codes of both halves; a null half contributes 0. */
+   @Override
+   public int hashCode() {
+     return 31*(cluster==null ? 0 : cluster.hashCode()) + (mention==null ? 0 : mention.hashCode());
+   }
+ }
+
+}
Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java Mon Nov 9 14:48:12 2015
@@ -20,12 +20,15 @@ public class TemporalFeatureExtractor im
List<Feature> feats = new ArrayList<>();
String a1dtr = getDocTimeRelForArg(jCas, arg1);
- feats.add(new Feature("Arg1DTR_" + a1dtr, true));
String a2dtr = getDocTimeRelForArg(jCas, arg2);
+
+ feats.add(new Feature("Arg1DTR_" + a1dtr, true));
feats.add(new Feature("Arg2DTR_" + a2dtr, true));
- if(a1dtr.equals(a2dtr) && !a1dtr.equals("NA")){
- feats.add(new Feature("DTR_Match", true));
+ if(a1dtr.equals(a2dtr)){
+ if(!a1dtr.equals("NA")){
+ feats.add(new Feature("DTR_Match", true));
+ }
}
return feats;
Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java?rev=1713449&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java Mon Nov 9 14:48:12 2015
@@ -0,0 +1,46 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+ /**
+  * Emits three distance features between a mention and a cluster. For each
+  * Markable member of the cluster the BaseToken, Markable and Sentence
+  * annotations lying between the member and the mention are counted; the
+  * smallest count of each kind, divided by a fixed constant, becomes a feature.
+  */
+ public class MentionClusterDistanceFeaturesExtractor
+     implements RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+   // Divisors applied to the raw minimum counts before they are emitted.
+   private static final double TOKEN_DIVISOR = 4000.0;
+   private static final double MARKABLE_DIVISOR = 900.0;
+   private static final double SENTENCE_DIVISOR = 350.0;
+
+   /**
+    * @param jCas the CAS being processed
+    * @param cluster the cluster whose members are measured against the mention
+    * @param mention the focus mention
+    * @return the features MinTokenDistance, MinMarkableDistance and MinSentDistance
+    */
+   @Override
+   public List<Feature> extract(JCas jCas, CollectionTextRelation cluster, IdentifiedAnnotation mention)
+       throws AnalysisEngineProcessException {
+     // the minima stay at Integer.MAX_VALUE when the cluster yields no Markable member
+     int fewestTokens = Integer.MAX_VALUE;
+     int fewestMarkables = Integer.MAX_VALUE;
+     int fewestSentences = Integer.MAX_VALUE;
+
+     for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+       fewestTokens = Math.min(fewestTokens,
+           JCasUtil.selectBetween(BaseToken.class, member, mention).size());
+       fewestMarkables = Math.min(fewestMarkables,
+           JCasUtil.selectBetween(Markable.class, member, mention).size());
+       fewestSentences = Math.min(fewestSentences,
+           JCasUtil.selectBetween(Sentence.class, member, mention).size());
+     }
+
+     List<Feature> distanceFeatures = new ArrayList<>();
+     distanceFeatures.add(new Feature("MinTokenDistance", fewestTokens / TOKEN_DIVISOR));
+     distanceFeatures.add(new Feature("MinMarkableDistance", fewestMarkables / MARKABLE_DIVISOR));
+     distanceFeatures.add(new Feature("MinSentDistance", fewestSentences / SENTENCE_DIVISOR));
+     return distanceFeatures;
+   }
+
+ }
Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java?rev=1713449&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java Mon Nov 9 14:48:12 2015
@@ -0,0 +1,76 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.DependencyTreeFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.FirstCovered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.LastCovered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
+import org.cleartk.ml.feature.extractor.DistanceExtractor;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.feature.extractor.NamingExtractor1;
+import org.cleartk.ml.feature.extractor.TypePathExtractor;
+
+ /**
+  * Extracts features that describe a single mention on its own. Only the
+  * token context extractor contributes to the output at present; the
+  * part-of-speech extractors are built but not applied.
+  */
+ public class MentionClusterMentionFeaturesExtractor implements FeatureExtractor1<Markable> {
+
+   /** Covered text of a single token. */
+   private final FeatureExtractor1<BaseToken> tokenText = new CoveredTextExtractor<>();
+
+   /**
+    * Token text in five contexts: first covered token, last covered token, bag
+    * of covered tokens, three preceding tokens and three following tokens
+    */
+   private final FeatureExtractor1<Markable> tokenContext = new CleartkExtractor<Markable,BaseToken>(
+       BaseToken.class,
+       tokenText,
+       new FirstCovered(1),
+       new LastCovered(1),
+       new Bag(new Covered()),
+       new Preceding(3),
+       new Following(3));
+
+   /** Value of the "partOfSpeech" type path of a single token. */
+   private final FeatureExtractor1<BaseToken> pos = new TypePathExtractor<>(BaseToken.class, "partOfSpeech");
+
+   /**
+    * All part-of-speech tags of the mention as a bag
+    */
+   private final FeatureExtractor1<Markable> tokenPOS = new CleartkExtractor<Markable,BaseToken>(
+       BaseToken.class,
+       pos,
+       new Bag(new Covered()));
+
+   /**
+    * The part-of-speech bag wrapped in a naming extractor configured with the
+    * name "mention1pos"; not applied by extract() at present
+    */
+   private final FeatureExtractor1<Markable> mentionFeaturesExtractor = new NamingExtractor1<>(
+       "mention1pos",
+       tokenPOS);
+
+   /**
+    * @param view the CAS view being processed
+    * @param focusAnnotation the mention to describe
+    * @return the token context features of the mention, in a new list
+    */
+   @Override
+   public List<Feature> extract(JCas view, Markable focusAnnotation) throws CleartkExtractorException {
+     List<Feature> mentionFeatures = new ArrayList<>();
+
+     // token features are the only ones switched on; covered-token count,
+     // part-of-speech and dependency features are currently disabled
+     List<Feature> contextFeatures = tokenContext.extract(view, focusAnnotation);
+     mentionFeatures.addAll(contextFeatures);
+
+     return mentionFeatures;
+   }
+
+ }