You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2016/06/16 14:51:51 UTC
svn commit: r1748736 [1/5] - in /ctakes/trunk/ctakes-coreference: ./
src/main/java/org/apache/ctakes/coreference/ae/
src/main/java/org/apache/ctakes/coreference/ae/features/
src/main/java/org/apache/ctakes/coreference/ae/features/cluster/
src/main/java...
Author: tmill
Date: Thu Jun 16 14:51:51 2016
New Revision: 1748736
URL: http://svn.apache.org/viewvc?rev=1748736&view=rev
Log:
Added cleartk-based coreference models to coref module along with eval code.
Added:
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/PersonChainAnnotator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/CorefSyntaxFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/SectionFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAgreementFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeVectorExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/ClinicalFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/GrammaticalRoleFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/MorphosyntacticFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfMarkableSalience.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/extractors/
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousBag.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousTextExtractor.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/ClusterUtils.java
Modified:
ctakes/trunk/ctakes-coreference/pom.xml
Modified: ctakes/trunk/ctakes-coreference/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/pom.xml?rev=1748736&r1=1748735&r2=1748736&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/pom.xml (original)
+++ ctakes/trunk/ctakes-coreference/pom.xml Thu Jun 16 14:51:51 2016
@@ -78,6 +78,14 @@
<artifactId>ctakes-resources-umls2011ab</artifactId>
<version>3.1.1</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-assertion</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-temporal</artifactId>
+ </dependency>
</dependencies>
<build>
<plugins>
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,297 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.ctakes.constituency.parser.util.TreeUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.cas.TOP;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewUriUtil;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+
+/**
+ * Writes the coreference chains of every processed document to a CoNLL-style file
+ * (one token per line, tab-separated columns, chain brackets in the last column).
+ * <p>
+ * Chains are read from the {@link CollectionTextRelation}s of the view named by
+ * {@link #PARAM_GOLD_VIEW_NAME} when that parameter is set ("gold" mode); otherwise they are read
+ * from the CAS handed to {@link #process(JCas)} ("system" mode). In system mode a second file
+ * named {@code <outputFilename>.icarus} is written as well.
+ * <p>
+ * Both writers are flushed after every document and closed in {@link #destroy()}.
+ */
+public class CoreferenceChainScoringOutput extends JCasAnnotator_ImplBase{
+  public static final String PARAM_OUTPUT_FILENAME = "OutputDirectory";
+  @ConfigurationParameter(
+      name = PARAM_OUTPUT_FILENAME,
+      mandatory = true,
+      description = "Name of chain file in CoNLL format"
+  )
+  private String outputFilename;
+  // CoNLL-style output, written in both gold and system mode.
+  private PrintWriter out = null;
+  // Additional "<outputFilename>.icarus" output; only opened in system mode.
+  private PrintWriter icOut = null;
+
+  public static final String PARAM_GOLD_VIEW_NAME = "GoldViewName";
+  @ConfigurationParameter(
+      name = PARAM_GOLD_VIEW_NAME,
+      mandatory = false,
+      description = "Name of gold view in jcas"
+  )
+  private String goldViewName = null;
+  // true when a gold view name was configured, i.e. chains are read from that view.
+  boolean isGold;
+
+  // Index of the current document, written as the second column of every token line.
+  private int docNum = 0;
+
+  /**
+   * Opens the output writer(s).
+   *
+   * @throws ResourceInitializationException if an output file cannot be opened for writing
+   */
+  @Override
+  public void initialize(final UimaContext context) throws ResourceInitializationException{
+    super.initialize(context);
+
+    try {
+      out = new PrintWriter(outputFilename);
+    } catch (FileNotFoundException e) {
+      throw new ResourceInitializationException(e);
+    }
+
+    isGold = goldViewName != null;
+    if(!isGold){
+      try {
+        icOut = new PrintWriter(outputFilename + ".icarus");
+      } catch (FileNotFoundException e) {
+        // do not leak the writer that was already opened
+        out.close();
+        throw new ResourceInitializationException(e);
+      }
+    }
+  }
+
+  /**
+   * Closes the output writers (which also flushes them) when the component is destroyed.
+   */
+  @Override
+  public void destroy() {
+    if(out != null){
+      out.close();
+    }
+    if(icOut != null){
+      icOut.close();
+    }
+    super.destroy();
+  }
+
+  /**
+   * Prints the chains of one document to stdout and appends the document to the output file(s).
+   * Documents without any {@link CollectionTextRelation} are not written to the CoNLL-style file.
+   *
+   * @param jCas the CAS of the document; tokens are always read from this CAS, chains from the
+   *             gold view when one is configured
+   * @throws AnalysisEngineProcessException if the configured gold view cannot be retrieved
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    File filename = new File(ViewUriUtil.getURI(jCas));
+    JCas chainsCas = null;
+    try {
+      chainsCas = goldViewName != null ? jCas.getView(goldViewName) : jCas;
+    } catch (CASException e) {
+      throw new AnalysisEngineProcessException(e);
+    }
+    int chainNum = 1;
+    HashMap<Annotation, Integer> ent2chain = new HashMap<>();
+
+    if(isGold) System.out.println("\nGold chains:");
+    else{
+      icOut.println(String.format("#begin document (%s); part 000", filename.getPath()));
+      System.out.println("\nSystem chains:");
+    }
+
+
+    Collection<CollectionTextRelation> rels = JCasUtil.select(chainsCas, CollectionTextRelation.class);
+    if(rels.isEmpty()){
+      if(!isGold){
+        // the begin marker was already written above; keep the Icarus file balanced
+        icOut.println("#end document");
+        icOut.flush();
+      }
+      return;
+    }
+
+    // Build a map from every markable that is in a chain to the number of the chain it is in (the number is not important as long
+    // as the numbers are distinct, so we just number the chains in the order the UIMA index gives them to us).
+    // This has to be backward compatible with the older coref module that added RelationArguments to a chain instead of Markables.
+    // So we grab the chain elements, check their type, then grab the markable annotation depending on that type.
+    for(CollectionTextRelation chain : rels){
+      FSList members = chain.getMembers();
+      // if we are doing cluster-mention coreference, some clusters will be singletons, we do not use those in conll scoring
+      if(members instanceof NonEmptyFSList &&
+          ((NonEmptyFSList)members).getTail() instanceof EmptyFSList) continue;
+
+      while(members instanceof NonEmptyFSList){
+        TOP head = ((NonEmptyFSList) members).getHead();
+        Annotation mention = null;
+        if(head instanceof Annotation){
+          mention = (Annotation) head;
+        }else{
+          mention = ((RelationArgument)head).getArgument();
+        }
+        ent2chain.put(mention, chainNum);
+        members = ((NonEmptyFSList)members).getTail();
+        System.out.print("Mention: " + mention.getCoveredText());
+        System.out.print(" (" + mention.getBegin() + ", " + mention.getEnd() + ")");
+        System.out.print(" -----> ");
+      }
+      System.out.println();
+      chainNum++;
+    }
+
+    // Here we are using newline tokens to delimit sentences because the sentence
+    // breaks that cTAKES creates may not be correct and some gold markables might
+    // wrap sentences which might be confusing to the consumer of this file.
+    out.println("#begin document (" + filename.getPath() + "); part 000");
+    List<BaseToken> tokens = new ArrayList<>(JCasUtil.select(jCas, BaseToken.class));
+    // chain numbers of mentions that have been opened but not yet closed
+    Multiset<Integer> endSet = HashMultiset.create();
+    int tokenId = 0;
+    int sentId = 0;
+    // guard against documents without tokens; the loop below is then simply skipped
+    BaseToken nextToken = tokens.isEmpty() ? null : tokens.get(0);
+
+    // NOTE(review): the very last token of a document is never flagged as a sentence end (there is
+    // no following token to look at), so a document that does not end in a newline token leaves
+    // its last sentence and any still-open mentions unclosed -- confirm whether that is intended.
+    for(int i = 0; i < tokens.size(); i++){
+      boolean endSentToken = false;
+      BaseToken token = nextToken;
+      if(i+1 < tokens.size()){
+        nextToken = tokens.get(i+1);
+        // a sentence ends before a newline, or at a "." when no mention is currently open
+        if(nextToken instanceof NewlineToken || (token.getCoveredText().equals(".") && endSet.isEmpty())){
+          endSentToken = true;
+        }
+      }
+
+      // newline tokens only delimit sentences; they never get an output line of their own
+      if(token instanceof NewlineToken){
+        continue;
+      }
+
+      int lastInd = token.getEnd();
+      // fix for some bad tokenization
+      if(token.getCoveredText().length() > 1 && token.getCoveredText().endsWith(".")){
+        lastInd = token.getEnd()-1;
+      }
+      List<Markable> markables = new ArrayList<>(JCasUtil.selectCovering(chainsCas, Markable.class, token.getBegin(), lastInd));
+      List<Annotation> startMention = new ArrayList<>();
+      Multiset<Integer> endMention = HashMultiset.create();
+      List<Integer> wholeMention = new ArrayList<>();
+
+      // classify every chained markable covering this token: does it start here, end here, or both?
+      for(Annotation markable : markables){
+        if(ent2chain.containsKey(markable)){
+          if(markable.getBegin() == token.getBegin()){
+            if(markable.getEnd() == token.getEnd()){
+              wholeMention.add(ent2chain.get(markable));
+            }else{
+              startMention.add(markable);
+            }
+          }else if(markable.getEnd() <= token.getEnd()){
+            if(endMention.contains(ent2chain.get(markable))){
+              System.err.println("There is a duplicate element -- should be handled by multiset");
+            }
+            if(markable.getEnd() < token.getEnd()){
+              System.err.println("There is a markable that ends in the middle of a token!");
+            }
+            endMention.add(ent2chain.get(markable));
+          }
+
+          if(!isGold){
+            icOut.println(String.format("%d-%d-%d\n", sentId, markable.getBegin(), markable.getEnd()));
+          }
+        }
+      }
+
+
+      out.print(filename.getPath());
+      out.print('\t');
+      out.print(docNum);
+      out.print('\t');
+      out.print(tokenId++);
+      out.print('\t');
+      out.print(TreeUtils.escapePunct(token.getCoveredText()));
+      out.print('\t');
+      out.print(token.getPartOfSpeech());
+      out.print('\t');
+      // parse bit -- assume flat parse (tokenId was already incremented, so 1 means first token)
+      if(tokenId == 1){
+        out.print("(NOPARSE*");
+        // special case for one word sentences:
+        if(endSentToken){
+          out.print(")");
+        }
+      }else if(endSentToken){
+        out.print("*)");
+      }else{
+        out.print("*");
+      }
+      out.print('\t');
+      // predicate lemma -- can ignore?
+      out.print('-'); out.print('\t');
+      // predicate frameset id -- can ignore?
+      out.print('-'); out.print('\t');
+      // word sense
+      out.print('-'); out.print('\t');
+      // speaker/author
+      out.print('-'); out.print('\t');
+      // named entities
+      out.print('*'); out.print('\t');
+
+      // coreference column: "ind)" closes a mention, "(ind)" is a one-token mention, "(ind" opens one
+      StringBuilder buff = new StringBuilder();
+      for(int ind : endMention){
+        if(endSet.contains(ind)){
+          buff.append(ind);
+          buff.append(')');
+          buff.append('|');
+        }
+        endSet.remove(ind);
+      }
+      for(int ind : wholeMention){
+        buff.append('(');
+        buff.append(ind);
+        buff.append(')');
+        buff.append('|');
+      }
+      // sort start mentions by ordering of ending (latest end is opened first)
+      while(startMention.size() > 0){
+        int ind;
+        Annotation latestEnd = null;
+        for(int j = 0; j < startMention.size(); j++){
+          if(latestEnd == null || startMention.get(j).getEnd() > latestEnd.getEnd()){
+            latestEnd = startMention.get(j);
+          }
+        }
+        startMention.remove(latestEnd);
+        ind = ent2chain.get(latestEnd);
+        buff.append('(');
+        buff.append(ind);
+        buff.append('|');
+        endSet.add(ind);
+      }
+
+      // In some datasets markables end in the middle of a token -- this is a problem because our check above is for all markables that cover the
+      // current token. In this case the markable end will still be unused when we get to the end of the sentence. We'll just hack it by throwing
+      // them on the last token of the sentence.
+      if(endSentToken && endSet.size() > 0){
+        System.err.println("Error! There are opened markables that never closed! Putting them on the end of the sentence.");
+        for(int ind : endSet){
+          buff.append(ind);
+          buff.append(')');
+          buff.append('|');
+        }
+        endSet.clear();
+      }
+      if(buff.length() > 0){
+        // drop the trailing '|' separator
+        out.println(buff.substring(0, buff.length()-1));
+      }else{
+        out.println("_");
+      }
+
+      if(endSentToken){
+        out.println();
+        tokenId = 0;
+        sentId++;
+      }
+    }
+    if(!isGold){
+      icOut.println("#end document");
+      // previously never flushed, so buffered Icarus output could be lost
+      icOut.flush();
+    }
+    out.println("#end document " + filename.getPath());
+    out.flush();
+    docNum++;
+  }
+}
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,214 @@
+package org.apache.ctakes.coreference.ae;
+
+import static org.apache.ctakes.dependency.parser.util.DependencyUtility.getDependencyNodes;
+import static org.apache.ctakes.dependency.parser.util.DependencyUtility.getProgeny;
+import static org.apache.ctakes.dependency.parser.util.DependencyUtility.getSentence;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+
+/**
+ * Rule-based annotator that adds {@link Markable} annotations to the CAS.
+ * <p>
+ * Markables are created from the dependency parse (noun heads expanded over their filtered
+ * progeny, non-"det" determiners, and "it" in a subject/object relation). Afterwards every
+ * {@link TimeMention} that does not already share its exact span with a markable gets one too.
+ */
+public class DeterministicMarkableAnnotator extends JCasAnnotator_ImplBase {
+
+  // list starters like A. or #1 or 3)
+  static Pattern headerPatt = Pattern.compile("^(([A-Z][\\.\\:\\)])|(#\\d+)|(\\d+[\\.\\:\\)])) *");
+
+  // Precompiled once instead of calling String.matches() (which recompiles) for every node.
+  private static final Pattern PUNCT_PATT = Pattern.compile("\\p{Punct}+");
+  private static final Pattern WHITESPACE_PATT = Pattern.compile("\\s+");
+  // a token that consists only of a list starter (same alternatives as headerPatt, whole-string match)
+  private static final Pattern LIST_MARKER_PATT = Pattern.compile("(([A-Z][\\.\\:\\)])|(#\\d+)|(\\d+[\\.\\:\\)]))");
+
+  @Override
+  public void initialize(UimaContext uc) throws ResourceInitializationException{
+    super.initialize(uc);
+  }
+
+  /**
+   * Creates dependency-based markables, then adds a markable for every time mention whose exact
+   * span is not already covered by an existing markable.
+   */
+  @Override
+  public void process(JCas jCas)
+      throws AnalysisEngineProcessException {
+
+    createMarkablesUsingDependencyTrees(jCas);
+
+    for(TimeMention timex : JCasUtil.select(jCas, TimeMention.class)){
+      boolean collision = false;
+      for(Markable other : JCasUtil.selectCovered(jCas, Markable.class, timex.getBegin(), timex.getEnd())){
+        if(other.getBegin() == timex.getBegin() && other.getEnd() == timex.getEnd()){
+          collision = true;
+          break;
+        }
+      }
+      if(!collision){
+        Markable m = new Markable(jCas, timex.getBegin(), timex.getEnd());
+        m.addToIndexes(jCas);
+      }
+    }
+  }
+
+  /**
+   * Walks the dependency nodes of every segment and adds markables for noun heads, stand-alone
+   * determiners and subject/object "it".
+   */
+  private static void createMarkablesUsingDependencyTrees(JCas jCas) {
+    for(Segment seg : JCasUtil.select(jCas, Segment.class)){
+      for(ConllDependencyNode node : JCasUtil.selectCovered(jCas, ConllDependencyNode.class, seg)){
+        String nodeText = node.getCoveredText().toLowerCase();
+
+        // skip the artificial root node and pure punctuation before doing any index lookups
+        if(node.getId() == 0){
+          continue;
+        }
+        if(PUNCT_PATT.matcher(nodeText).matches()){
+          continue;
+        }
+
+        // first terminal constituency node covered by this dependency node, if any
+        List<TerminalTreebankNode> terms = JCasUtil.selectCovered(TerminalTreebankNode.class, node);
+        TerminalTreebankNode term = null;
+        if(terms.size() > 0){
+          term = terms.get(0);
+        }
+
+        // 1) get nouns, and expand the markable to the phrase they cover
+        // 2) get determiners like "this" and "these"
+        // 3) non-passive "it"
+        if(node.getPostag().startsWith("NN") && term != null && term.getNodeType().startsWith("N")){
+          if(WHITESPACE_PATT.matcher(node.getForm()).matches()) continue;
+          // TODO fix this godawful hack:
+          if(nodeText.equals("date") || nodeText.equals("tablet") || nodeText.equals("hg") || nodeText.equals("lb") || nodeText.equals("status")
+              || nodeText.equals("capsule") || nodeText.equals("mg") || nodeText.equals("cm")){
+
+            continue;
+          }
+          int begin = node.getBegin();
+          int end = node.getEnd();
+          // widen the span to cover all retained descendants of the head noun
+          List<ConllDependencyNode> progeny = getProgeny(node, getDependencyNodes(jCas, getSentence(jCas, node)));
+          progeny = removeUnannotatedNodes(node, progeny);
+          for(ConllDependencyNode child : progeny){
+            if(child.getBegin() < begin){
+              begin = child.getBegin();
+            }
+            if(child.getEnd() > end){
+              end = child.getEnd();
+            }
+          }
+          ConllDependencyNode parent = node.getHead();
+          if(parent != null && parent.getId() != 0){
+            // if parent is inside the bounds of the proposed markable prune it a bit.
+            if(parent.getBegin() < node.getBegin() && parent.getBegin() > begin){
+              // get the following token:
+              BaseToken nextToken = JCasUtil.selectFollowing(BaseToken.class, parent, 1).get(0);
+              begin = nextToken.getBegin();
+            }
+            // parent is after the current head node but before the proposed markable is meant to end:
+            if(parent.getEnd() > node.getEnd() && parent.getEnd() < end){
+              BaseToken prevToken = JCasUtil.selectPreceding(BaseToken.class, parent, 1).get(0);
+              end = prevToken.getEnd();
+            }
+          }
+
+          // NOTE(review): nodeText is the lower-cased text of the head node only, so the [A-Z]
+          // alternative of headerPatt cannot match here, and a match offset is applied to a
+          // begin that may already have been moved to a descendant -- confirm the intent.
+          Matcher m = headerPatt.matcher(nodeText);
+          if(m.find()){
+            begin = begin + m.end();
+          }
+
+          Markable markable = new Markable(jCas, begin, end);
+          markable.addToIndexes();
+        }else if(node.getPostag().equals("DT") && !node.getDeprel().equals("det")){
+          Markable markable = new Markable(jCas, node.getBegin(), node.getEnd());
+          markable.addToIndexes();
+        }else if(nodeText.equals("it") && node.getDeprel().contains("bj")){
+          // contains "bj" includes nsubj, all the obj's, and all the *bjpass*'s.
+          Markable markable = new Markable(jCas, node.getBegin(), node.getEnd());
+          markable.addToIndexes();
+        }
+      }
+    }
+  }
+
+  // Post-process to remove those kinds of nodes which may or may not be correctly parsed but do not tend to align with gold annotated
+  // markables (and usually our intuitions as well, so it's not completely hacky).
+  // A descendant is dropped when any node on its path to originalNode (other than originalNode itself) is a conjunction,
+  // punctuation, "meta" dependent or a bare list marker. The returned list is only used to compute span bounds.
+  private static List<ConllDependencyNode> removeUnannotatedNodes(ConllDependencyNode originalNode,
+      List<ConllDependencyNode> progeny) {
+    List<ConllDependencyNode> filtered = new ArrayList<>();
+
+    for(ConllDependencyNode node: progeny){
+      if(node == originalNode){
+        // always keep the head itself, and keep it only once
+        filtered.add(node);
+        continue;
+      }
+
+      boolean blockedByConj = false;
+      for(ConllDependencyNode pathEl : DependencyUtility.getPath(progeny, node, originalNode)){
+        if(pathEl == originalNode) continue;
+        if(pathEl.getDeprel().equals("conj") || pathEl.getDeprel().equals("cc") || pathEl.getPostag().equals(".") || pathEl.getPostag().equals(",") || pathEl.getDeprel().equals("punct") || pathEl.getDeprel().equals("meta")
+            || LIST_MARKER_PATT.matcher(pathEl.getCoveredText()).matches()){
+          blockedByConj = true;
+          break;
+        }
+      }
+      if(!blockedByConj){
+        filtered.add(node);
+      }
+    }
+
+    return filtered;
+  }
+
+  /**
+   * Alternative markable creation based on constituency trees: one markable per NP (minus list
+   * starters and a trailing "." or ":") plus one per non-final, non-NNP nominal terminal child.
+   * Currently not called from {@link #process(JCas)}.
+   */
+  @SuppressWarnings("unused")
+  private static void createMarkablesUsingConstituencyTrees(JCas jCas) {
+    // NPs:
+    for(TreebankNode tree : JCasUtil.select(jCas, TreebankNode.class)){
+      if(tree.getNodeType().equals("NP")){
+        String nodeText = tree.getCoveredText();
+        // cases to skip: 1) personal pronouns
+        // 2) existential "there"
+        // 3) numbers
+        if(tree.getChildren().size() == 1){
+          if(tree.getChildren(0).getNodeType().equals("PRP") ||
+              tree.getChildren(0).getNodeType().equals("EX") ||
+              tree.getChildren(0).getNodeType().equals("CD")) {
+            continue;
+          }
+        }
+        Markable markable = null;
+        Matcher m = headerPatt.matcher(nodeText);
+        int start = tree.getBegin();
+        int end = tree.getEnd();
+        if(m.find()){
+          start = start + m.end();
+        }
+        // strip a trailing "." or ":" as long as something is left of the span
+        if((nodeText.endsWith(".") || nodeText.endsWith(":")) && end-1 > start){
+          end = end-1;
+        }
+
+        markable = new Markable(jCas, start, end);
+        markable.addToIndexes();
+
+        // N* modifiers of NPs (every child except the last one):
+        for(int i = 0; i < tree.getChildren().size()-1; i++){
+          TreebankNode child = tree.getChildren(i);
+          if(child instanceof TerminalTreebankNode && child.getNodeType().startsWith("N") && !child.getNodeType().equals("NNP")){
+            markable = new Markable(jCas, child.getBegin(), child.getEnd());
+            markable.addToIndexes();
+          }
+        }
+      }
+    }
+  }
+}
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,819 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.features.AttributeFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.CorefSyntaxFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.DistSemFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.SalienceFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.SectionFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.TemporalFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.features.DependencyPathFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.DependencyTreeFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.NamedEntityFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.PartOfSpeechFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.PhraseChunkingExtractor;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.relationextractor.ae.features.TokenFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.FloatArray;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.CleartkProcessingException;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+public class EventCoreferenceAnnotator extends RelationExtractorAnnotator {
+
+ public static final String IDENTITY_RELATION = "Identity";
+
+ public static final int DEFAULT_SENT_DIST = 5;
+ public static final String PARAM_SENT_DIST = "SentenceDistance";
+ @ConfigurationParameter(name = PARAM_SENT_DIST, mandatory = false, description = "Number of sentences allowed between coreferent mentions")
+ private int maxSentDist = DEFAULT_SENT_DIST;
+
+ public static final double DEFAULT_PAR_SIM = 0.5;
+ public static final String PARAM_PAR_SIM = "PararaphSimilarity";
+ @ConfigurationParameter(name = PARAM_PAR_SIM, mandatory = false, description = "Similarity required to pair paragraphs for coreference")
+ private double simThreshold = DEFAULT_PAR_SIM;
+
+ public static final boolean DEFAULT_SCORE_ALL = false;
+ public static final String PARAM_SCORE_ALL = "ScoreAllPairs";
+ @ConfigurationParameter(name = PARAM_SCORE_ALL, mandatory = false, description = "Whether to score all pairs (as in a feature detector")
+ private boolean scoreAll = DEFAULT_SCORE_ALL;
+
+ private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null; // rebuilt in process(): dependency node -> IdentifiedAnnotations covering it
+ private Map<Markable,Set<String>> markableEnts = null; // per-document cache of getBestEnt() results, filled in process()
+ private List<Markable> markablesByConfidence = null; // document markables sorted with MarkableConfidenceComparator (highest confidence first)
+ private Map<Annotation,NonEmptyFSList> chains = null; // chain member -> first list element of the placeholder chain built in createRelation()
+ private double lastScore; // Identity score recorded by classify() when scoreAll is set
+
+
+ private Logger logger = Logger.getLogger(EventCoreferenceAnnotator.class);
+
+ /**
+  * Creates the description of this annotator configured for training.
+  *
+  * @param dataWriterClass data writer implementation that receives the training instances
+  * @param outputDirectory directory handed to the directory data writer factory
+  * @param downsamplingRate value for PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE
+  * @return engine description with training mode switched on
+  * @throws ResourceInitializationException if the description cannot be created
+  */
+ public static AnalysisEngineDescription createDataWriterDescription(
+     Class<? extends DataWriter<String>> dataWriterClass,
+     File outputDirectory,
+     float downsamplingRate) throws ResourceInitializationException {
+   // name/value pairs, in the same order the factory has always received them
+   Object[] trainingSettings = {
+       CleartkAnnotator.PARAM_IS_TRAINING, true,
+       RelationExtractorAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE, downsamplingRate,
+       DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, dataWriterClass,
+       DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, outputDirectory
+   };
+   return AnalysisEngineFactory.createEngineDescription(EventCoreferenceAnnotator.class, trainingSettings);
+ }
+
+ public static AnalysisEngineDescription createAnnotatorDescription(String modelPath)
+ throws ResourceInitializationException {
+ return AnalysisEngineFactory.createEngineDescription(
+ EventCoreferenceAnnotator.class,
+ CleartkAnnotator.PARAM_IS_TRAINING,
+ false,
+ GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+ modelPath);
+ }
+
+ public static AnalysisEngineDescription createScoringAnnotatorDescription(String modelPath)
+ throws ResourceInitializationException {
+ return AnalysisEngineFactory.createEngineDescription(
+ EventCoreferenceAnnotator.class,
+ CleartkAnnotator.PARAM_IS_TRAINING,
+ false,
+ GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+ modelPath,
+ EventCoreferenceAnnotator.PARAM_SCORE_ALL,
+ true);
+ }
+
+ @Override
+ protected List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> getFeatureExtractors() {
+ List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> featureExtractorList = new ArrayList<>();
+
+ // pick and choose from base class:
+ featureExtractorList.add(new TokenFeaturesExtractor());
+ featureExtractorList.add(new PartOfSpeechFeaturesExtractor());
+ featureExtractorList.add(new PhraseChunkingExtractor());
+// featureExtractorList.add(new NamedEntityFeaturesExtractor()); // same features in UMLSFeatureExtractor
+ featureExtractorList.add(new DependencyTreeFeaturesExtractor());
+// featureExtractorList.add(new DependencyPathFeaturesExtractor()); // not in mention-cluster version
+
+// featureList.add(new DistanceFeatureExtractor());
+ featureExtractorList.add(new StringMatchingFeatureExtractor());
+ featureExtractorList.add(new TokenFeatureExtractor()); // agreement features
+ featureExtractorList.add(new SectionFeatureExtractor());
+ featureExtractorList.add(new UMLSFeatureExtractor());
+ featureExtractorList.add(new CorefSyntaxFeatureExtractor()); // dep head feature
+ featureExtractorList.add(new TemporalFeatureExtractor());
+
+ // added for feature parity with cluster version:
+ featureExtractorList.add(new SalienceFeatureExtractor());
+ featureExtractorList.add(new AttributeFeatureExtractor());
+
+// featureExtractorList.add(new ChainStackFeatureExtractor());
+
+// featureExtractorList.add(new DocumentStructureTreeExtractor());
+ try{
+ featureExtractorList.add(new DistSemFeatureExtractor());
+ }catch(IOException e){
+ e.printStackTrace();
+ }
+
+ return featureExtractorList;
+ }
+
+ @Override
+ public void process(JCas jCas) throws AnalysisEngineProcessException {
+ if(this.isTraining() && JCasUtil.select(jCas, CoreferenceRelation.class).size() == 0){
+ logger.debug("Skipping document with no gold standard coreference relations.");
+ return;
+ }
+ numClassifications = 0;
+ nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+ markableEnts = new HashMap<>();
+ chains = new HashMap<>();
+ markablesByConfidence = new ArrayList<>(JCasUtil.select(jCas, Markable.class));
+ Collections.sort(markablesByConfidence, new MarkableConfidenceComparator());
+ for(Markable m : markablesByConfidence){
+ markableEnts.put(m, getBestEnt(jCas, m));
+ }
+ super.process(jCas);
+ if(!this.isTraining() && !this.scoreAll){
+ for(NonEmptyFSList chainHead : new HashSet<>(chains.values())){
+ CollectionTextRelation chain = new CollectionTextRelation(jCas);
+ chain.setMembers(chainHead);
+
+ NonEmptyFSList cur = chainHead;
+ while(cur.getTail() != null){
+ cur = (NonEmptyFSList) cur.getTail();
+ }
+ EmptyFSList tail = new EmptyFSList(jCas);
+ tail.addToIndexes();
+ cur.setTail(tail);
+
+ chain.addToIndexes();
+ }
+ }
+ logger.debug("This document had : " + numClassifications + " pair classifications");
+ foundAnaphors.clear();
+ chains.clear();
+ }
+
+ /** Delegates to the superclass; this annotator has no collection-level work of its own. */
+ @Override
+ public void collectionProcessComplete() throws AnalysisEngineProcessException {
+   super.collectionProcessComplete();
+ }
+
+ /**
+  * Supplies the candidate (antecedent, anaphor) pairs for one segment as a
+  * {@link PairIterable} built over that segment.
+  */
+ @Override
+ protected Iterable<IdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
+     JCas jcas, Annotation segment) {
+   PairIterable candidates = new PairIterable(jcas, segment);
+   return candidates;
+ }
+
+ /**
+  * Pairs every anaphor inside the segment with the preceding markables in the document,
+  * nearest first.
+  *
+  * An antecedent below the confidence threshold is skipped. Unless the anaphor is typed
+  * as an anatomical site or medication, the search for that anaphor stops at the first
+  * antecedent whose sentence distance exceeds maxSentDist. When both mentions carry
+  * entity types, at least one type must be shared.
+  *
+  * @param jcas document CAS
+  * @param segment segment whose markables act as anaphors
+  * @param confidence minimum antecedent confidence
+  * @return candidate pairs as (antecedent, anaphor)
+  */
+ public List<IdentifiedAnnotationPair> getClosePairs(JCas jcas, Annotation segment, double confidence){
+   List<Markable> docMarkables = new ArrayList<>(JCasUtil.select(jcas, Markable.class));
+   List<IdentifiedAnnotationPair> closePairs = new ArrayList<>();
+   String anatomyType = AnatomicalSiteMention.class.getSimpleName();
+   String medicationType = MedicationEventMention.class.getSimpleName();
+
+   for(int anaIdx = 1; anaIdx < docMarkables.size(); anaIdx++){
+     Markable ana = docMarkables.get(anaIdx);
+     // only look at anaphors w/in this segment:
+     if(!dominates(segment, ana)){
+       continue;
+     }
+     Set<String> anaTypes = getBestEnt(jcas, ana);
+     // anatomical sites and medications are exempt from the sentence-distance cut-off
+     boolean distanceLimited = !anaTypes.contains(anatomyType) && !anaTypes.contains(medicationType);
+
+     for(int anteIdx = anaIdx - 1; anteIdx >= 0; anteIdx--){
+       Markable ante = docMarkables.get(anteIdx);
+       if(ante.getConfidence() < confidence){
+         continue;
+       }
+       if(distanceLimited && sentDist(jcas, ante, ana) > maxSentDist){
+         break;
+       }
+
+       Set<String> anteTypes = getBestEnt(jcas, ante);
+       // if neither has a sem type, or only one is tagged, let them try to match
+       boolean bothTyped = !anaTypes.isEmpty() && !anteTypes.isEmpty();
+       if(bothTyped && Collections.disjoint(anaTypes, anteTypes)){
+         // both correspond to named entities but share no category
+         continue;
+       }
+       closePairs.add(new IdentifiedAnnotationPair(ante, ana));
+     }
+   }
+   return closePairs;
+ }
+
+ public Set<String> getBestEnt(JCas jcas, Markable markable){
+ if(markableEnts.containsKey(markable)) return markableEnts.get(markable);
+// markableEnts.put(markable, new HashSet<String>());
+ Set<String> bestEnts = new HashSet<>();
+ IdentifiedAnnotation bestEnt = null;
+ Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+ ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+ Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+ for(IdentifiedAnnotation ent : coveringEnts){
+ if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+ ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent);
+ if(entHead == head){
+ if(bestEnt == null){
+ bestEnt = ent;
+ }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+ // if the span of this entity is bigger than the biggest existing one:
+ bestEnt = ent;
+ otherBestEnts = new HashSet<>();
+ }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+ // there is another one with the exact same span and possibly different type!
+ otherBestEnts.add(ent);
+ }
+ }
+ }
+
+ if(bestEnt!=null){
+ bestEnts.add(bestEnt.getClass().getSimpleName());
+// markableEnts.get(markable).add(bestEnt.getClass().getSimpleName());
+ for(IdentifiedAnnotation other : otherBestEnts){
+ bestEnts.add(other.getClass().getSimpleName());
+// markableEnts.get(markable).add(other.getClass().getSimpleName());
+ }
+ }
+ return bestEnts;
+// return markableEnts.get(markable);
+ }
+ public static boolean dominates(Annotation arg1, Annotation arg2) {
+ return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
+ }
+
+ /**
+  * Pairs markables that occur in the same paragraph. For every markable in a paragraph,
+  * the earlier markables of that paragraph are tried nearest first until one is more
+  * than maxSentDist away according to sentDist().
+  *
+  * The previous version also allocated and zero-filled a paragraph-by-paragraph
+  * similarity matrix that was never read; that dead work has been dropped.
+  *
+  * @param jcas document CAS
+  * @param segment unused; pairs are collected over all paragraphs of the document
+  * @return candidate pairs as (antecedent, anaphor)
+  */
+ public List<IdentifiedAnnotationPair> getParagraphPairs(JCas jcas, Annotation segment){
+   List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+
+   for(Paragraph par : JCasUtil.select(jcas, Paragraph.class)){
+     // get all pairs within this paragraph
+     List<Markable> curParMarkables = JCasUtil.selectCovered(Markable.class, par);
+     for(int anaId = 1; anaId < curParMarkables.size(); anaId++){
+       Markable ana = curParMarkables.get(anaId);
+       for(int anteId = anaId-1; anteId >= 0; anteId--){
+         Markable ante = curParMarkables.get(anteId);
+         if(sentDist(jcas, ante, ana) > maxSentDist) break;
+         pairs.add(new IdentifiedAnnotationPair(ante, ana));
+       }
+     }
+   }
+   return pairs;
+ }
+
+ /**
+  * Pairs markables across paragraphs whose vectors are similar. Each paragraph is
+  * compared with every earlier paragraph; when calculateSimilarity() exceeds
+  * simThreshold, each markable of the later paragraph is paired with the markables of
+  * the earlier one, last first, until one is more than maxSentDist away.
+  *
+  * The old P-by-P memo matrix is gone: every (i, j) cell was visited exactly once, so it
+  * never saved a computation.
+  *
+  * NOTE(review): assumes the single FSArray in the CAS holds one FloatArray per
+  * Paragraph, in paragraph order -- confirm against the component that writes it.
+  *
+  * @param jcas document CAS
+  * @param segment unused; pairs are collected over all paragraphs of the document
+  * @return candidate pairs as (antecedent, anaphor)
+  */
+ public List<IdentifiedAnnotationPair> getSimilarPairs(JCas jcas, Annotation segment){
+   List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+   FSArray parVecs = JCasUtil.selectSingle(jcas, FSArray.class);
+   List<Paragraph> pars = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class));
+
+   for(int i = 0; i < pars.size(); i++){
+     List<Markable> curParMarkables = JCasUtil.selectCovered(Markable.class, pars.get(i));
+     FloatArray parVec = (FloatArray) parVecs.get(i);
+     for(int j = i-1; j >= 0; j--){
+       double sim = calculateSimilarity(parVec, (FloatArray) parVecs.get(j));
+       if(sim > simThreshold){
+         // pair up all markables in each paragraph
+         List<Markable> prevParMarkables = JCasUtil.selectCovered(Markable.class, pars.get(j));
+         for(int anaId = 0; anaId < curParMarkables.size(); anaId++){
+           Markable ana = curParMarkables.get(anaId);
+           for(int anteId = prevParMarkables.size()-1; anteId >= 0; anteId--){
+             Markable ante = prevParMarkables.get(anteId);
+             if(sentDist(jcas, ante, ana) > maxSentDist) break;
+             pairs.add(new IdentifiedAnnotationPair(ante, ana));
+           }
+         }
+       }
+     }
+   }
+   return pairs;
+ }
+
+ /**
+  * Pairs each markable in the segment with the confident markables of the document.
+  *
+  * Antecedent candidates are read from markablesByConfidence (highest confidence first),
+  * so the scan for one anaphor stops at the first candidate below the threshold. A
+  * candidate that begins and ends after the anaphor is skipped. When the anaphor has
+  * entity types, an antecedent that also has types must share at least one of them.
+  *
+  * Fix: the "antecedent is after the anaphor" test compared ante.getEnd() with itself,
+  * so it was never true and following markables were offered as antecedents.
+  *
+  * @param jcas document CAS
+  * @param segment segment whose markables act as anaphors
+  * @param threshold minimum antecedent confidence
+  * @return candidate pairs as (antecedent, anaphor)
+  */
+ public List<IdentifiedAnnotationPair> getConfidentPairs(JCas jcas, Annotation segment, double threshold){
+   List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+   List<Markable> anas = JCasUtil.selectCovered(Markable.class, segment);
+
+   for(Markable ana : anas){
+     for(Markable ante : markablesByConfidence){
+       // if we are into the unconfident
+       if(ante.getConfidence() < threshold){
+         break;
+       }
+
+       // if the candidate antecedent is after the anaphor skip it.
+       if(ante.getBegin() > ana.getBegin() && ante.getEnd() > ana.getEnd()){
+         continue;
+       }
+
+       // if the anaphor has a sem type make sure a typed antecedent matches it;
+       // an untyped anaphor or an untyped antecedent is always allowed through
+       boolean match = markableEnts.get(ana).isEmpty()
+           || markableEnts.get(ante).isEmpty()
+           || !Collections.disjoint(markableEnts.get(ana), markableEnts.get(ante));
+
+       if(match){
+         pairs.add(new IdentifiedAnnotationPair(ante, ana));
+       }
+     }
+   }
+   return pairs;
+ }
+ /**
+  * Markables that are in a section header are highly salient and prime candidates
+  * as antecedents in coreference. Headers are detected as paragraphs that contain
+  * exactly one sentence. This is probably high recall with some precision hits, but
+  * that is acceptable for now.
+  *
+  * For each markable in the segment, the paragraphs covered by the span from the start
+  * of the document to the markable's begin offset are inspected, and every markable in a
+  * single-sentence paragraph whose confidence exceeds the threshold becomes an antecedent.
+  *
+  * @param jcas document CAS
+  * @param segment segment whose markables act as anaphors
+  * @param confidence antecedent confidence must be strictly greater than this
+  * @return candidate pairs as (antecedent, anaphor)
+  */
+ public List<IdentifiedAnnotationPair> getSectionHeaderPairs(JCas jcas, Annotation segment, double confidence){
+   List<IdentifiedAnnotationPair> headerPairs = new ArrayList<>();
+   for(Markable segMarkable : JCasUtil.selectCovered(jcas, Markable.class, segment)){
+     IdentifiedAnnotation ana = segMarkable;
+     for(Paragraph par : JCasUtil.selectCovered(jcas, Paragraph.class, 0, ana.getBegin())){
+       List<Sentence> parSentences = JCasUtil.selectCovered(jcas, Sentence.class, par);
+       if(parSentences == null || parSentences.size() != 1){
+         continue; // not a header-like paragraph
+       }
+       for(Markable anteCandidate : JCasUtil.selectCovered(jcas, Markable.class, par)){
+         if(anteCandidate.getConfidence() > confidence){
+           headerPairs.add(new IdentifiedAnnotationPair(anteCandidate, ana));
+         }
+       }
+     }
+   }
+   return headerPairs;
+ }
+
+ /**
+  * Links each markable of the segment to existing chains. For every
+  * CollectionTextRelation in the CAS, the member list is walked from the front until a
+  * null member or a member ending after the anaphor is reached; the last member seen
+  * whose span differs from the anaphor's is proposed as the antecedent.
+  *
+  * @param jcas document CAS
+  * @param segment segment whose markables act as anaphors
+  * @return candidate pairs as (antecedent, anaphor)
+  */
+ public List<IdentifiedAnnotationPair> getAlreadyLinkedPairs(JCas jcas, Annotation segment){
+   List<IdentifiedAnnotationPair> linkedPairs = new ArrayList<>();
+   List<Markable> segMarkables = new ArrayList<>(JCasUtil.selectCovered(jcas, Markable.class, segment));
+
+   for(Markable ana : segMarkables){
+     for(CollectionTextRelation chain : JCasUtil.select(jcas, CollectionTextRelation.class)){
+       Markable closestMember = null;
+       for(FSList node = chain.getMembers(); node instanceof NonEmptyFSList; node = ((NonEmptyFSList) node).getTail()){
+         Markable member = (Markable) ((NonEmptyFSList) node).getHead();
+
+         // ignore markables past the current anaphor
+         if(member == null || member.getEnd() > ana.getEnd()){
+           break;
+         }
+         // a member with the anaphor's own span is not an antecedent
+         if(member.getBegin() != ana.getBegin() || member.getEnd() != ana.getEnd()){
+           closestMember = member;
+         }
+       }
+       if(closestMember != null){
+         linkedPairs.add(new IdentifiedAnnotationPair(closestMember, ana));
+       }
+     }
+   }
+
+   return linkedPairs;
+ }
+
+ /**
+  * Pairs each markable of the segment with every markable covered by the span from the
+  * document start to the anaphor's begin offset that has the same lower-cased nominal
+  * head word. Markables without a nominal head node are ignored on either side.
+  *
+  * @param jcas document CAS
+  * @param segment segment whose markables act as anaphors
+  * @return candidate pairs as (antecedent, anaphor)
+  */
+ public List<IdentifiedAnnotationPair> getHeadwordMatchingPairs(JCas jcas, Annotation segment){
+   List<IdentifiedAnnotationPair> headMatches = new ArrayList<>();
+   List<Markable> segMarkables = new ArrayList<>(JCasUtil.selectCovered(jcas, Markable.class, segment));
+   for(Markable ana : segMarkables){
+     ConllDependencyNode anaHead = DependencyUtility.getNominalHeadNode(jcas, ana);
+     if(anaHead == null){
+       continue;
+     }
+     String anaHeadword = anaHead.getCoveredText().toLowerCase();
+
+     for(Markable ante : JCasUtil.selectCovered(jcas, Markable.class, 0, ana.getBegin())){
+       ConllDependencyNode anteHead = DependencyUtility.getNominalHeadNode(jcas, ante);
+       if(anteHead == null){
+         continue;
+       }
+       if(anaHeadword.equals(anteHead.getCoveredText().toLowerCase())){
+         headMatches.add(new IdentifiedAnnotationPair(ante, ana));
+       }
+     }
+   }
+   return headMatches;
+ }
+
+ /**
+  * Classifies one candidate pair and counts the call. In score-all mode the classifier's
+  * score for {@link #IDENTITY_RELATION} is stored in lastScore and that category is
+  * returned whatever the classifier predicted.
+  */
+ @Override
+ protected String classify(List<Feature> features)
+     throws CleartkProcessingException {
+   numClassifications++;
+
+   String predicted = super.classifier.classify(features);
+   if(!this.scoreAll){
+     return predicted;
+   }
+
+   Map<String,Double> categoryScores = super.classifier.score(features);
+   this.lastScore = categoryScores.get(IDENTITY_RELATION);
+   return IDENTITY_RELATION;
+ }
+
+ /** Candidate pairs are generated per {@link Segment}. */
+ @Override
+ protected Class<? extends Annotation> getCoveringClass() {
+   return Segment.class;
+ }
+
+ /** The relation type handled by this annotator is {@link CoreferenceRelation}. */
+ @Override
+ protected Class<? extends BinaryTextRelation> getRelationClass() {
+   return CoreferenceRelation.class;
+ }
+
+ protected HashSet<IdentifiedAnnotation> foundAnaphors = new HashSet<>(); // anaphors already linked (createRelation) or gold-linked (getRelationCategory); cleared at the end of process()
+ int numClassifications = 0; // reset per document in process(); incremented in classify() and in PairIterator.next()
+
+ /**
+  * Records a predicted coreference link.
+  *
+  * In score-all mode every pair becomes a CoreferenceRelation carrying lastScore as its
+  * confidence. Otherwise an anaphor is linked at most once: the relation is indexed, the
+  * anaphor is remembered in foundAnaphors, and the anaphor is appended to the
+  * antecedent's placeholder chain (a new two-element chain is started when the
+  * antecedent has none). A second link for the same anaphor is only logged as an error.
+  */
+ @Override
+ protected void createRelation(
+     JCas jCas,
+     IdentifiedAnnotation ante,
+     IdentifiedAnnotation ana,
+     String predictedCategory) {
+   if(this.scoreAll){
+     // scoring every pair: "found anaphors" has no meaning in this mode
+     CoreferenceRelation scored = buildRelation(jCas, ante, ana, predictedCategory);
+     scored.setConfidence(this.lastScore);
+     scored.addToIndexes();
+     return;
+   }
+
+   if(foundAnaphors.contains(ana)){
+     logger.error("Greedy coreference resolution violated -- anaphor linked to two candidate antecedents!");
+     return;
+   }
+
+   // add the relation to the CAS
+   CoreferenceRelation link = buildRelation(jCas, ante, ana, predictedCategory);
+   link.addToIndexes();
+   foundAnaphors.add(ana);
+
+   if(chains.containsKey(ante)){
+     // extend the antecedent's existing chain
+     NonEmptyFSList chainHead = chains.get(ante);
+     NonEmptyFSList lastElement = chainHead;
+     NonEmptyFSList anaElement = new NonEmptyFSList(jCas);
+     anaElement.setHead(ana);
+     anaElement.setTail(null);
+
+     for(FSList rest = lastElement.getTail(); rest != null; rest = lastElement.getTail()){
+       lastElement = (NonEmptyFSList) rest;
+     }
+
+     lastElement.setTail(anaElement);
+     chains.put(ana, chainHead);
+     anaElement.addToIndexes();
+   }else{
+     // new chain: antecedent followed by anaphor, left unterminated until process() closes it
+     NonEmptyFSList anteElement = new NonEmptyFSList(jCas);
+     NonEmptyFSList anaElement = new NonEmptyFSList(jCas);
+     anteElement.setHead(ante);
+     anaElement.setHead(ana);
+     anteElement.setTail(anaElement);
+     anaElement.setTail(null);
+     chains.put(ante, anteElement);
+     chains.put(ana, anteElement);
+     anaElement.addToIndexes();
+     anteElement.addToIndexes();
+   }
+ }
+
+ /**
+  * Builds a CoreferenceRelation between two annotations. Both RelationArguments are
+  * added to the CAS indexes here; the relation itself is returned un-indexed so the
+  * caller decides when to add it.
+  *
+  * @param ante annotation stored as arg1 with role "Antecedent"
+  * @param ana annotation stored as arg2 with role "Anaphor"
+  * @param predictedCategory category set on the relation
+  */
+ private CoreferenceRelation buildRelation(JCas jCas, Annotation ante, Annotation ana, String predictedCategory){
+   RelationArgument antecedentArg = new RelationArgument(jCas);
+   antecedentArg.setArgument(ante);
+   antecedentArg.setRole("Antecedent");
+   antecedentArg.addToIndexes();
+
+   RelationArgument anaphorArg = new RelationArgument(jCas);
+   anaphorArg.setArgument(ana);
+   anaphorArg.setRole("Anaphor");
+   anaphorArg.addToIndexes();
+
+   CoreferenceRelation coref = new CoreferenceRelation(jCas);
+   coref.setArg1(antecedentArg);
+   coref.setArg2(anaphorArg);
+   coref.setCategory(predictedCategory);
+   return coref;
+ }
+
+ /**
+  * Looks up the gold category through the superclass and, as a side effect, logs
+  * distance/salience statistics: every positive pair and roughly ten percent of the
+  * negative pairs. An anaphor with a gold link is recorded in foundAnaphors.
+  */
+ @Override
+ protected String getRelationCategory(
+     Map<List<Annotation>, BinaryTextRelation> relationLookup,
+     IdentifiedAnnotation ante, IdentifiedAnnotation ana) {
+   String category = super.getRelationCategory(relationLookup, ante, ana);
+   int sentenceGap = sentsBetween(ante, ana);
+
+   boolean coreferent = category != null && !category.equals(NO_RELATION_CATEGORY);
+   if(coreferent){
+     foundAnaphors.add(ana);
+     logger.info(String.format("DISTSALIENCE: (%d,%f,1)\n", sentenceGap, ante.getConfidence()));
+   }else if(Math.random() < 0.1){
+     // sample 10 percent of negative examples
+     logger.info(String.format("DISTSALIENCE: (%d,%f,0)\n", sentenceGap, ante.getConfidence()));
+   }
+   return category;
+ }
+
+ public static int sentDist(JCas jcas, IdentifiedAnnotation arg1,
+ IdentifiedAnnotation arg2){
+ return JCasUtil.selectCovered(jcas, Sentence.class, arg1.getBegin(), arg2.getEnd()).size();
+ }
+
+ /**
+  * Counts the Sentence annotations that JCasUtil.selectBetween reports between the two
+  * annotations.
+  */
+ public static int sentsBetween(IdentifiedAnnotation arg1,
+     IdentifiedAnnotation arg2) {
+   return JCasUtil.selectBetween(Sentence.class, arg1, arg2).size();
+ }
+
+ /**
+  * Cosine similarity of two vectors, iterating over the length of f1. Each product is
+  * computed in float arithmetic and accumulated in double precision.
+  */
+ private static double calculateSimilarity(FloatArray f1, FloatArray f2){
+   double dot = 0.0f;
+   double squaredNorm1 = 0.0;
+   double squaredNorm2 = 0.0;
+
+   int length = f1.size();
+   for(int idx = 0; idx < length; idx++){
+     float a = f1.get(idx);
+     float b = f2.get(idx);
+     // keep the multiplications in float so results match the element type's precision
+     dot += (a * b);
+     squaredNorm1 += (a * a);
+     squaredNorm2 += (b * b);
+   }
+
+   double norm1 = Math.sqrt(squaredNorm1);
+   double norm2 = Math.sqrt(squaredNorm2);
+   return dot / (norm1 * norm2);
+ }
+
+ /**
+  * Iterable wrapper around a single {@link PairIterator}. The iterator is created once
+  * in the constructor and the same instance is handed out by every iterator() call.
+  */
+ class PairIterable implements Iterable<IdentifiedAnnotationPair> {
+
+   PairIterator iter;
+
+   public PairIterable(JCas jcas, Annotation segment){
+     this.iter = new PairIterator(jcas, segment);
+   }
+
+   @Override
+   public Iterator<IdentifiedAnnotationPair> iterator() {
+     return this.iter;
+   }
+
+ }
+
+ /**
+  * Iterator over the candidate (antecedent, anaphor) pairs of one segment.
+  *
+  * The candidates are collected eagerly in the constructor. While iterating, pairs in
+  * which one mention contains the other are dropped, as are pairs whose anaphor is
+  * already in foundAnaphors at the moment the pair is reached.
+  *
+  * Fix: hasNext() used to consume a candidate on every call, so calling it twice in a
+  * row silently skipped a pair, and next() could return a stale or null pair after
+  * exhaustion. The iterator now keeps a look-ahead slot: hasNext() is idempotent and
+  * next() hands out each viable pair exactly once.
+  */
+ class PairIterator implements Iterator<IdentifiedAnnotationPair> {
+
+   JCas jcas = null;
+   Annotation segment = null;
+   // Two passes are foreseen -- first the preliminary pairs, then links to existing
+   // chains. This could be fixed by creating UIMA chains as we go instead of using
+   // placeholder chains, but that is substantially more complicated. The second pass is
+   // currently always empty.
+   List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+   List<IdentifiedAnnotationPair> pass2Pairs = null;
+   // look-ahead slot: the pair the next call to next() will return, or null if none is pending
+   IdentifiedAnnotationPair next = null;
+
+   public PairIterator(JCas jcas, Annotation segment) {
+     this.jcas = jcas;
+     this.segment = segment;
+
+     pairs.addAll(getClosePairs(jcas, segment, 0.0));
+     pairs.addAll(getSectionHeaderPairs(jcas, segment, 0.0));
+     pairs.addAll(getAlreadyLinkedPairs(jcas, segment));
+     pairs.addAll(getHeadwordMatchingPairs(jcas, segment));
+     // getConfidentPairs(jcas, segment, 0.25) and confidence-based sorting are
+     // intentionally not used here.
+   }
+
+   @Override
+   public boolean hasNext() {
+     if(next != null){
+       return true; // a viable pair is already pending
+     }
+
+     next = pollViable(pairs);
+     if(next == null){
+       if(pass2Pairs == null){
+         pass2Pairs = new ArrayList<>();
+       }
+       next = pollViable(pass2Pairs);
+     }
+     return next != null; // null here means there were no good candidates left
+   }
+
+   @Override
+   public IdentifiedAnnotationPair next() {
+     if(!hasNext()){
+       throw new java.util.NoSuchElementException("No more candidate coreference pairs");
+     }
+     IdentifiedAnnotationPair result = next;
+     next = null;
+     numClassifications++;
+     return result;
+   }
+
+   @Override
+   public void remove() {
+     // Optional operation; intentionally a no-op as before.
+   }
+
+   /** Removes pairs from the front of the queue until a viable one is found; null when the queue runs dry. */
+   private IdentifiedAnnotationPair pollViable(List<IdentifiedAnnotationPair> queue){
+     while(!queue.isEmpty()){
+       IdentifiedAnnotationPair candidate = queue.remove(0);
+       IdentifiedAnnotation ante = candidate.getArg1();
+       IdentifiedAnnotation ana = candidate.getArg2();
+       if(dominates(ante, ana) || dominates(ana, ante)) continue;
+       if(!foundAnaphors.contains(ana)){
+         return candidate;
+       }
+     }
+     return null;
+   }
+
+ }
+
+ public class MarkablePairConfidenceComparator implements
+ Comparator<IdentifiedAnnotationPair> {
+
+ public int compare(IdentifiedAnnotationPair o1, IdentifiedAnnotationPair o2) {
+ if(o1 == o2) return 0;
+ int sim;
+ IdentifiedAnnotation ante1 = o1.getArg1();
+ IdentifiedAnnotation ante2 = o2.getArg1();
+ IdentifiedAnnotation ana1 = o1.getArg2();
+ IdentifiedAnnotation ana2 = o2.getArg2();
+
+ // first level sorting is by anaphor:
+ if(ana1.getBegin() != ana2.getBegin()){
+ sim = ana1.getBegin() - ana2.getBegin() > 0 ? 1 : -1;
+ }else if(ana1.getEnd() != ana2.getEnd()){
+ sim = ana1.getEnd() - ana2.getEnd() > 0 ? 1 : -1;
+ }else{
+ // sort by antecedent
+ if(ante1.getConfidence() > ante2.getConfidence()){
+ sim = -1;
+ }else if(ante1.getConfidence() < ante2.getConfidence()){
+ sim = 1;
+ }else{
+ sim = 0;
+ }
+ }
+
+ return sim;
+ }
+
+ }
+
+ public class MarkableConfidenceComparator implements Comparator<Markable> {
+ public int compare(Markable m1, Markable m2){
+ if(m1 == m2) return 0;
+ if(m1.getConfidence() > m2.getConfidence()){
+ return -1;
+ }else if(m1.getConfidence() < m2.getConfidence()){
+ return 1;
+ }else{
+ return 0;
+ }
+ }
+ }
+
+ /**
+  * Orders pairs by anaphor begin, anaphor end, antecedent begin and antecedent end,
+  * in that order of precedence, each ascending.
+  */
+ public class IdentifiedAnnotationPairComparator implements Comparator<IdentifiedAnnotationPair> {
+
+   public int compare(IdentifiedAnnotationPair o1, IdentifiedAnnotationPair o2) {
+     if(o1 == o2) return 0;
+     IdentifiedAnnotation firstAnte = o1.getArg1();
+     IdentifiedAnnotation secondAnte = o2.getArg1();
+     IdentifiedAnnotation firstAna = o1.getArg2();
+     IdentifiedAnnotation secondAna = o2.getArg2();
+
+     // first level sorting is by anaphor:
+     if(firstAna.getBegin() != secondAna.getBegin()){
+       return firstAna.getBegin() - secondAna.getBegin() > 0 ? 1 : -1;
+     }
+     if(firstAna.getEnd() != secondAna.getEnd()){
+       return firstAna.getEnd() - secondAna.getEnd() > 0 ? 1 : -1;
+     }
+     // then by antecedent:
+     if(firstAnte.getBegin() != secondAnte.getBegin()){
+       return firstAnte.getBegin() - secondAnte.getBegin() > 0 ? 1 : -1;
+     }
+     if(firstAnte.getEnd() != secondAnte.getEnd()){
+       return firstAnte.getEnd() - secondAnte.getEnd() > 0 ? 1 : -1;
+     }
+     return 0;
+   }
+
+ }
+
+ private class AnnotationComparator implements Comparator<Annotation> {
+
+ public AnnotationComparator() {
+ }
+
+ @Override
+ public int compare(Annotation o1, Annotation o2) {
+ if(o1.getBegin() < o2.getBegin()){
+ return -1;
+ }else if(o1.getBegin() == o2.getBegin() && o1.getEnd() < o2.getEnd()){
+ return -1;
+ }else if(o1.getBegin() == o2.getBegin() && o1.getEnd() > o2.getEnd()){
+ return 1;
+ }else if(o2.getBegin() < o1.getBegin()){
+ return 1;
+ }else{
+ return 0;
+ }
+ }
+ }
+
+}
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,82 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.struct.MapFactory;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewUriUtil;
+
+/**
+ * Stores, for every Markable of a document, its nominal head dependency node in a map
+ * obtained from MapFactory under a per-document key (see {@link #getKey(JCas)}).
+ */
+public class MarkableHeadTreeCreator extends JCasAnnotator_ImplBase {
+
+  private static final String MAP_KEY = "MarkableHeadMap";
+
+  private static final Logger logger = Logger.getLogger(MarkableHeadTreeCreator.class);
+
+  @Override
+  public void initialize(UimaContext context) throws ResourceInitializationException {
+    super.initialize(context);
+  }
+
+  /**
+   * Fills the document's markable-to-head map. A markable for which no nominal head is
+   * found is stored with whatever DependencyUtility returned for it.
+   */
+  @Override
+  public void process(JCas jcas) throws AnalysisEngineProcessException {
+    Map<Markable,ConllDependencyNode> treeMap = MapFactory.createInstance(getKey(jcas));
+
+    for(Markable m: JCasUtil.select(jcas, Markable.class)){
+      ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, m);
+      treeMap.put(m, headNode);
+    }
+  }
+
+  /**
+   * Builds the MapFactory key for a document.
+   *
+   * The document ID annotation is tried first, then the view URI. If both fail, an ad hoc
+   * ID is built from the first 20 characters of the text and its hash code.
+   *
+   * Fixes: the "no document ID" sentinel is now compared with equals() rather than by
+   * reference; failures are logged instead of printed to stderr; a null document text no
+   * longer throws in the fallback.
+   *
+   * @param jcas document CAS
+   * @return document identifier followed by "-MarkableHeadMap"
+   */
+  public static String getKey(JCas jcas){
+    String docId = null;
+    try{
+      docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
+    }catch(Exception e){
+      // best effort: fall through to the other ways of identifying the document
+      logger.warn("Could not read document ID annotation", e);
+    }
+    if(docId == null || DocumentIDAnnotationUtil.NO_DOCUMENT_ID.equals(docId)){
+      try {
+        docId = ViewUriUtil.getURI(jcas).toString();
+      } catch (AnalysisEngineProcessException e) {
+        logger.warn("No document ID found using traditional methods. Using ad hoc combination", e);
+        String docText = jcas.getDocumentText();
+        if(docText == null){
+          docText = "";
+        }
+        docId = docText.substring(0, Math.min(20, docText.length())) + "_hash=" + docText.hashCode();
+      }
+    }
+    return docId + "-" + MAP_KEY;
+  }
+
+  /** Orders markables by begin offset, then by end offset, both ascending. */
+  public static class MarkableDepheadPairComparator implements Comparator<Markable> {
+
+    @Override
+    public int compare(Markable m1, Markable m2) {
+      // look at the start first
+      if(m1.getBegin() < m2.getBegin()){
+        return -1;
+      }else if(m2.getBegin() < m1.getBegin()){
+        return 1;
+      }else if(m1.getEnd() < m2.getEnd()){
+        return -1;
+      }else if(m2.getEnd() < m1.getEnd()){
+        return 1;
+      }else{
+        // m1 and m2 have the exact same span
+        return 0;
+      }
+    }
+  }
+}
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,87 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.ctakes.coreference.ae.features.salience.ClinicalFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.salience.GrammaticalRoleFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.salience.MorphosyntacticFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.salience.SemanticEnvironmentFeatureExtractor;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+ /**
+  * ClearTK annotator that works on every {@link Markable} in a CAS.
+  * <p>
+  * In training mode each markable is written out as an instance whose outcome
+  * is {@code true} when the markable's current confidence is above 0.5. In
+  * classification mode the classifier score for the {@code true} outcome is
+  * stored as the markable's confidence.
+  */
+ public class MarkableSalienceAnnotator extends CleartkAnnotator<Boolean> {
+
+     /** Feature extractors applied, in this order, to every markable. */
+     List<FeatureExtractor1<Markable>> extractors = new ArrayList<>();
+
+     /**
+      * Creates a description of this annotator configured for training.
+      *
+      * @param dataWriterClass the data writer implementation that receives the instances
+      * @param outputDirectory the directory handed to the data writer factory
+      * @return the analysis engine description
+      * @throws ResourceInitializationException if the description cannot be created
+      */
+     public static AnalysisEngineDescription createDataWriterDescription(
+             Class<? extends DataWriter<Boolean>> dataWriterClass,
+             File outputDirectory) throws ResourceInitializationException{
+         AnalysisEngineDescription trainingDescription = AnalysisEngineFactory.createEngineDescription(
+                 MarkableSalienceAnnotator.class,
+                 CleartkAnnotator.PARAM_IS_TRAINING, true,
+                 DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, dataWriterClass,
+                 DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, outputDirectory);
+         return trainingDescription;
+     }
+
+     /**
+      * Creates a description of this annotator configured for classification.
+      *
+      * @param modelPath the path of the classifier jar
+      * @return the analysis engine description
+      * @throws ResourceInitializationException if the description cannot be created
+      */
+     public static AnalysisEngineDescription createAnnotatorDescription(String modelPath) throws ResourceInitializationException{
+         AnalysisEngineDescription classifyingDescription = AnalysisEngineFactory.createEngineDescription(
+                 MarkableSalienceAnnotator.class,
+                 CleartkAnnotator.PARAM_IS_TRAINING, false,
+                 GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath);
+         return classifyingDescription;
+     }
+
+     /**
+      * Initializes the parent annotator and then registers the feature extractors.
+      */
+     @Override
+     public void initialize(UimaContext context)
+             throws ResourceInitializationException {
+         super.initialize(context);
+
+         // the registration order is the order in which features are produced
+         extractors.add(new MorphosyntacticFeatureExtractor());
+         extractors.add(new GrammaticalRoleFeatureExtractor());
+         extractors.add(new SemanticEnvironmentFeatureExtractor());
+         extractors.add(new ClinicalFeatureExtractor());
+     }
+
+     /**
+      * Extracts features for each markable and either writes a training
+      * instance or stores the classifier score on the markable.
+      */
+     @Override
+     public void process(JCas jcas) throws AnalysisEngineProcessException {
+         for(Markable candidate : JCasUtil.select(jcas, Markable.class)){
+             List<Feature> featureList = collectFeatures(jcas, candidate);
+
+             if(!this.isTraining()){
+                 // classification: keep the score of the positive outcome
+                 Map<Boolean,Double> scores = this.classifier.score(featureList);
+                 candidate.setConfidence(scores.get(true).floatValue());
+                 continue;
+             }
+
+             // training: the label is derived from the confidence already on the markable
+             Instance<Boolean> example = new Instance<>(featureList);
+             example.setOutcome(candidate.getConfidence() > 0.5);
+             this.dataWriter.write(example);
+         }
+     }
+
+     /** Runs every registered extractor on the markable and concatenates the results. */
+     private List<Feature> collectFeatures(JCas jcas, Markable markable) throws AnalysisEngineProcessException {
+         List<Feature> collected = new ArrayList<>();
+         for(FeatureExtractor1<Markable> extractor : extractors){
+             collected.addAll(extractor.extract(jcas, markable));
+         }
+         return collected;
+     }
+ }