You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2016/02/01 20:37:16 UTC
svn commit: r1727990 - in /ctakes/trunk:
ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/
ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/
ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/
Author: tmill
Date: Mon Feb 1 19:37:16 2016
New Revision: 1727990
URL: http://svn.apache.org/viewvc?rev=1727990&view=rev
Log:
Uima-fitized old coreference pipeline (converted its annotators to uimaFIT components configured via @ConfigurationParameter fields).
Added:
ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/cogVeds.txt
ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/modalAdjs.txt
ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/otherVerbs.txt
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/CoreferencePipelineFactory.java
Modified:
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableCreator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableExpander.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkablePairGenerator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AbstractClassifier.java
ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AnnotationSelector.java
Added: ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/cogVeds.txt
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/cogVeds.txt?rev=1727990&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/cogVeds.txt (added)
+++ ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/cogVeds.txt Mon Feb 1 19:37:16 2016
@@ -0,0 +1,8 @@
+recommended
+thought
+believed
+known
+anticipated
+assumed
+expected
+noted
\ No newline at end of file
Added: ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/modalAdjs.txt
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/modalAdjs.txt?rev=1727990&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/modalAdjs.txt (added)
+++ ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/modalAdjs.txt Mon Feb 1 19:37:16 2016
@@ -0,0 +1,16 @@
+necessary
+good
+economical
+possible
+useful
+easy
+certain
+advisable
+desirable
+likely
+convenient
+difficult
+important
+sufficient
+legal
+worthwhile
\ No newline at end of file
Added: ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/otherVerbs.txt
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/otherVerbs.txt?rev=1727990&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/otherVerbs.txt (added)
+++ ctakes/trunk/ctakes-coreference-res/src/main/resources/org/apache/ctakes/coreference/otherVerbs.txt Mon Feb 1 19:37:16 2016
@@ -0,0 +1,4 @@
+seem
+appear
+mean
+follow
\ No newline at end of file
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableCreator.java?rev=1727990&r1=1727989&r2=1727990&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableCreator.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableCreator.java Mon Feb 1 19:37:16 2016
@@ -18,30 +18,45 @@
*/
package org.apache.ctakes.coreference.ae;
+import java.io.File;
+import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.HashSet;
+import java.util.Scanner;
+import java.util.Set;
+import org.apache.ctakes.coreference.type.DemMarkable;
+import org.apache.ctakes.coreference.type.NEMarkable;
+import org.apache.ctakes.coreference.type.PronounMarkable;
+import org.apache.ctakes.coreference.util.AnnotationSelector;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.ctakes.coreference.util.AnnotationSelector;
-import org.apache.ctakes.typesystem.type.syntax.Chunk;
-import org.apache.ctakes.typesystem.type.syntax.WordToken;
-import org.apache.ctakes.coreference.type.DemMarkable;
-import org.apache.ctakes.coreference.type.NEMarkable;
-import org.apache.ctakes.coreference.type.PronounMarkable;
-
public class MipacqMarkableCreator extends JCasAnnotator_ImplBase {
public static int nextID = 0;
- HashSet<String> modalAdj;
- HashSet<String> cogved;
- HashSet<String> otherVerb;
+ public static final String PARAM_MODAL_ADJ = "modalAdj";
+ @ConfigurationParameter(name = PARAM_MODAL_ADJ, mandatory=false, defaultValue="org/apache/ctakes/coreference/modalAdjs.txt")
+ File modalAdjFile = null;
+ Set<String> modalAdj;
+
+ public static final String PARAM_COGVED = "cogVeds";
+ @ConfigurationParameter(name = PARAM_COGVED, mandatory=false, defaultValue="org/apache/ctakes/coreference/cogVeds.txt")
+ File cogvedFile = null;
+ Set<String> cogved;
+
+ public static final String PARAM_OTHER_VERB = "otherVerbs";
+ @ConfigurationParameter(name = PARAM_OTHER_VERB, mandatory=false, defaultValue="org/apache/ctakes/coreference/otherVerbs.txt")
+ File otherVerbFile=null;
+ Set<String> otherVerb;
// LOG4J logger based on class name
private Logger logger = Logger.getLogger(getClass().getName());
@@ -51,17 +66,26 @@ public class MipacqMarkableCreator exten
super.initialize(uc);
// Load modal adjectives and cognitive verbs for pleonastic patterns
- String[] ma = (String[]) uc.getConfigParameterValue("modalAdj");
- modalAdj = new HashSet<String>();
- for (String s : ma) modalAdj.add(s);
- String[] cv = (String[]) uc.getConfigParameterValue("cogved");
- cogved = new HashSet<String>();
- for (String s : cv) cogved.add(s);
- String[] ov = (String[]) uc.getConfigParameterValue("otherVerb");
- otherVerb = new HashSet<String>();
- for (String s : ov) otherVerb.add(s);
+ try{
+ modalAdj = readWordlistFile(modalAdjFile);
+ cogved = readWordlistFile(cogvedFile);
+ otherVerb = readWordlistFile(otherVerbFile);
+ }catch(FileNotFoundException e){
+ throw new ResourceInitializationException(e);
+ }
}
+ private static final Set<String> readWordlistFile(File inputFile) throws FileNotFoundException{
+ HashSet<String> words = new HashSet<>();
+ try(Scanner scanner = new Scanner(inputFile)){
+ while(scanner.hasNextLine()){
+ String line = scanner.nextLine().trim();
+ words.add(line);
+ }
+ }
+ return words;
+ }
+
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableExpander.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableExpander.java?rev=1727990&r1=1727989&r2=1727990&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableExpander.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkableExpander.java Mon Feb 1 19:37:16 2016
@@ -24,21 +24,18 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
-import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.ctakes.coreference.type.DemMarkable;
+import org.apache.ctakes.coreference.type.Markable;
+import org.apache.ctakes.coreference.type.NEMarkable;
+import org.apache.ctakes.coreference.util.FSIteratorToList;
+import org.apache.ctakes.coreference.util.MarkableTreeUtils;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
-import org.apache.ctakes.coreference.eval.helpers.Span;
-import org.apache.ctakes.coreference.util.FSIteratorToList;
-import org.apache.ctakes.coreference.util.MarkableTreeUtils;
-import org.apache.ctakes.typesystem.type.syntax.Chunk;
-import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
-import org.apache.ctakes.coreference.type.DemMarkable;
-import org.apache.ctakes.coreference.type.Markable;
-import org.apache.ctakes.coreference.type.NEMarkable;
-
public class MipacqMarkableExpander extends JCasAnnotator_ImplBase {
@Override
@@ -50,12 +47,13 @@ public class MipacqMarkableExpander exte
mergeNP(aJCas);
elevateAdjectives(aJCas);
iter = aJCas.getJFSIndexRepository().getAnnotationIndex(Markable.type).iterator();
- rmDup(aJCas, FSIteratorToList.convert(iter));
+ rmDup(FSIteratorToList.convert(iter));
}
- private void removeDoctors(JCas jCas) {
+ /*
+ private static void removeDoctors(JCas jCas) {
FSIterator<Annotation> iter = jCas.getAnnotationIndex(NEMarkable.type).iterator();
- ArrayList<Annotation> rm = new ArrayList<Annotation>();
+ ArrayList<Annotation> rm = new ArrayList<>();
while(iter.hasNext()){
NEMarkable m = (NEMarkable) iter.next();
if(m.getCoveredText().equalsIgnoreCase("dr")){
@@ -65,11 +63,11 @@ public class MipacqMarkableExpander exte
for(Annotation a: rm){
a.removeFromIndexes();
}
- }
+ }*/
- private void removeHistoryOf(JCas jCas) {
+ private static void removeHistoryOf(JCas jCas) {
FSIterator<Annotation> iter = jCas.getAnnotationIndex(NEMarkable.type).iterator();
- ArrayList<Annotation> rm = new ArrayList<Annotation>();
+ ArrayList<Annotation> rm = new ArrayList<>();
while(iter.hasNext()){
NEMarkable m = (NEMarkable) iter.next();
if(m.getCoveredText().equalsIgnoreCase("history of")){
@@ -81,10 +79,10 @@ public class MipacqMarkableExpander exte
}
}
- private void expandToNP (JCas aJCas, LinkedList<Annotation> markables) {
+ private static void expandToNP (JCas aJCas, LinkedList<Annotation> markables) {
// FSIterator<Annotation> iter = aJCas.getJFSIndexRepository().getAnnotationIndex(LookupWindowAnnotation.type).iterator();
- FSIterator<Annotation> iter = aJCas.getAnnotationIndex(TreebankNode.type).iterator();
- LinkedList<Annotation> l = FSIteratorToList.convert(iter);
+// FSIterator<Annotation> iter = aJCas.getAnnotationIndex(TreebankNode.type).iterator();
+// LinkedList<Annotation> l = FSIteratorToList.convert(iter);
for (Annotation m : markables){
TreebankNode node = MarkableTreeUtils.markableNode(aJCas, m.getBegin(), m.getEnd());
@@ -116,7 +114,8 @@ public class MipacqMarkableExpander exte
// are any of the named entities contained within this chunk?
// if so return the first that is.
- private Annotation containsAny (Chunk c, LinkedList<Annotation> l) {
+ /*
+ private static Annotation containsAny (Chunk c, LinkedList<Annotation> l) {
int a = c.getBegin();
int b = c.getEnd();
for (Annotation ne : l)
@@ -126,19 +125,20 @@ public class MipacqMarkableExpander exte
return null;
return null;
}
+ */
// merge NP# -> NP' PP, where NP' is marked as a Markable, by making NP# a markable
- private void mergeNP (JCas jcas) {
+ private static void mergeNP (JCas jcas) {
Map<Integer,TreebankNode> innerMap = null;
// mark the boundaries of every NP:
FSIterator<Annotation> nodeIter = jcas.getAnnotationIndex(TreebankNode.type).iterator();
- HashMap<Integer,Map<Integer,TreebankNode>> npMap = new HashMap<Integer,Map<Integer,TreebankNode>>();
+ HashMap<Integer,Map<Integer,TreebankNode>> npMap = new HashMap<>();
while(nodeIter.hasNext()){
TreebankNode node = (TreebankNode) nodeIter.next();
if(node.getNodeType().equals("NP")){
innerMap = npMap.get(node.getBegin());
if(innerMap == null){
- innerMap = new HashMap<Integer,TreebankNode>();
+ innerMap = new HashMap<>();
}
innerMap.put(node.getEnd(), node);
npMap.put(node.getBegin(), innerMap);
@@ -168,7 +168,7 @@ public class MipacqMarkableExpander exte
* surgical procedures
*/
- private void elevateAdjectives(JCas jcas){
+ private static void elevateAdjectives(JCas jcas){
FSIterator<Annotation> markables = jcas.getAnnotationIndex(NEMarkable.type).iterator();
while(markables.hasNext()){
NEMarkable mark = (NEMarkable) markables.next();
@@ -187,9 +187,9 @@ public class MipacqMarkableExpander exte
}
- private void rmDup(JCas aJCas, LinkedList<Annotation> markables) {
- HashSet<Annotation> rm = new HashSet<Annotation>();
- HashMap<String,Annotation> keep = new HashMap<String,Annotation>();
+ private static void rmDup(LinkedList<Annotation> markables) {
+ HashSet<Annotation> rm = new HashSet<>();
+ HashMap<String,Annotation> keep = new HashMap<>();
for (int i = 0; i < markables.size(); i++) {
Annotation m1 = markables.get(i);
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkablePairGenerator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkablePairGenerator.java?rev=1727990&r1=1727989&r2=1727990&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkablePairGenerator.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqMarkablePairGenerator.java Mon Feb 1 19:37:16 2016
@@ -22,62 +22,39 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.HashSet;
-import java.util.Hashtable;
import java.util.LinkedList;
-import java.util.Vector;
-import org.apache.log4j.Logger;
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.cas.EmptyFSList;
-import org.apache.uima.jcas.cas.FSList;
-import org.apache.uima.jcas.cas.NonEmptyFSList;
-import org.apache.uima.jcas.cas.NonEmptyFloatList;
-import org.apache.uima.jcas.tcas.Annotation;
import org.apache.ctakes.coreference.type.BooleanLabeledFS;
-
-
-import org.apache.ctakes.core.resource.FileResource;
-import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
-import org.apache.ctakes.coreference.eval.helpers.Span;
-import org.apache.ctakes.coreference.eval.helpers.SpanAlignment;
-import org.apache.ctakes.coreference.eval.helpers.SpanOffsetComparator;
+import org.apache.ctakes.coreference.type.DemMarkable;
+import org.apache.ctakes.coreference.type.Markable;
+import org.apache.ctakes.coreference.type.MarkablePairSet;
+import org.apache.ctakes.coreference.type.NEMarkable;
+import org.apache.ctakes.coreference.type.PronounMarkable;
import org.apache.ctakes.coreference.util.CorefConsts;
import org.apache.ctakes.coreference.util.FSIteratorToList;
-import org.apache.ctakes.coreference.util.MarkableTreeUtils;
import org.apache.ctakes.coreference.util.PairAttributeCalculator;
-import org.apache.ctakes.coreference.util.ParentPtrTree;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.Chunk;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
-import org.apache.ctakes.coreference.type.DemMarkable;
-import org.apache.ctakes.coreference.type.Markable;
-import org.apache.ctakes.coreference.type.MarkablePair;
-import org.apache.ctakes.coreference.type.MarkablePairSet;
-import org.apache.ctakes.coreference.type.NEMarkable;
-import org.apache.ctakes.coreference.type.PronounMarkable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
public class MipacqMarkablePairGenerator extends JCasAnnotator_ImplBase {
+ public static final String PARAM_STOPWORDS_FILE = "StopFile";
+ @ConfigurationParameter(name=PARAM_STOPWORDS_FILE, mandatory=false, defaultValue="org/apache/ctakes/coreference/models/stop.txt")
+ File stopwordFile = null;
+ HashSet<String> stopwords;
+
// LOG4J logger based on class name
private Logger logger = Logger.getLogger(getClass().getName());
- private int maxSpanID = 0;
- HashSet<String> stopwords;
-// ParentPtrTree ppt;
-//
-// Vector<Span> goldSpans = null;
-// Hashtable<String,Integer> goldSpan2id = null;
-// Vector<int[]> goldPairs = null;
-//
-// Vector<Span> sysSpans = null;
-// Hashtable<String,Integer> sysSpan2id = null;
-// Vector<int[]> sysPairs = null;
-// Hashtable<Integer, Integer> sysId2AlignId = null;
-// Hashtable<Integer, Integer> goldId2AlignId = null;
-// Hashtable<Integer, Integer> alignId2GoldId = null;
-// int[] goldEqvCls;
int numVecs = 0;
@Override
@@ -86,20 +63,20 @@ public class MipacqMarkablePairGenerator
// Load stop words list
try {
- stopwords = new HashSet<String>();
- FileResource r = (FileResource) uc.getResourceObject("stopWords");
- BufferedReader br = new BufferedReader(new FileReader(r.getFile()));
- String l;
- while ((l = br.readLine())!=null) {
- l = l.trim();
- if (l.length()==0) continue;
- int i = l.indexOf('|');
- if (i > 0)
- stopwords.add(l.substring(0,i).trim());
- else if (i < 0)
- stopwords.add(l.trim());
+ stopwords = new HashSet<>();
+ try(BufferedReader br = new BufferedReader(new FileReader(stopwordFile))){
+ String l;
+ while ((l = br.readLine())!=null) {
+ l = l.trim();
+ if (l.length()==0) continue;
+ int i = l.indexOf('|');
+ if (i > 0)
+ stopwords.add(l.substring(0,i).trim());
+ else if (i < 0)
+ stopwords.add(l.trim());
+ }
}
- logger.info("Stop words list loaded: " + r.getFile().getAbsolutePath());
+ logger.info("Stop words list loaded: " + stopwordFile.getAbsolutePath());
} catch (Exception e) {
e.printStackTrace();
logger.error("Error loading stop words list");
@@ -111,33 +88,12 @@ public class MipacqMarkablePairGenerator
public void process(JCas jcas) throws AnalysisEngineProcessException {
// read the gold standard
numVecs = 0;
-// sysId2AlignId = new Hashtable<Integer, Integer>();
-// goldId2AlignId = new Hashtable<Integer, Integer>();
-// alignId2GoldId = new Hashtable<Integer, Integer>();
- String docName = DocumentIDAnnotationUtil.getDocumentID(jcas);
-// if (docName==null) docName = "141471681_1";
-// System.out.print("creating vectors for "+docName);
-// loadGoldStandard(docName);
-// else loadGoldStandard();
+// String docName = DocumentIDAnnotationUtil.getDocumentID(jcas);
// Convert the orderless FSIterator to List, sort by char offsets
LinkedList<Annotation> lm = FSIteratorToList.convert(
jcas.getJFSIndexRepository().getAnnotationIndex(Markable.type).iterator());
-// loadSystemPairs(lm);
-// // align the spans
-// SpanAlignment sa = new SpanAlignment(goldSpans.toArray(new Span[goldSpans.size()]),
-// sysSpans.toArray(new Span[sysSpans.size()]));
-//
-// int[] id = sa.get1();
-// for (int i = 0; i < id.length; i++){
-// alignId2GoldId.put(id[i]+maxSpanID, goldSpan2id.get(goldSpans.get(i).toString()));
-// goldId2AlignId.put(goldSpan2id.get(goldSpans.get(i).toString()), id[i] + maxSpanID);
-// }
-// id = sa.get2();
-// for (int i = 0; i < id.length; i++){
-// sysId2AlignId.put(sysSpan2id.get(sysSpans.get(i).toString()), id[i]+maxSpanID);
-// }
// now iterate over system markables and add the ones that match gold standard as
// true, otherwise false
for (int p = 1; p < lm.size(); ++p) {
@@ -218,13 +174,6 @@ public class MipacqMarkablePairGenerator
tail = (NonEmptyFSList) tail.getTail();
}
tail.setHead(labeledAntecedent);
-// if (isGoldPair(a, m)){
-// labeledAntecedent.setLabel(true);
-// // FIXME this cannot be done, it's implicitly looking at the label and changing the possible outcomes...
-// break; // stop if a gold pair is found
-// }else{
-// labeledAntecedent.setLabel(false);
-// }
}
if(tail == null) pairList.setAntecedentList(new EmptyFSList(jcas));
else tail.setTail(new EmptyFSList(jcas));
@@ -254,13 +203,6 @@ public class MipacqMarkablePairGenerator
tail = (NonEmptyFSList) tail.getTail();
}
tail.setHead(labeledAntecedent);
-// if (isGoldPair(a, m)){
-// // FIXME
-// labeledAntecedent.setLabel(true);
-// break; // stop if a gold pair is found
-// }else{
-// labeledAntecedent.setLabel(false);
-// }
}
if(tail == null) pairList.setAntecedentList(new EmptyFSList(jcas));
else tail.setTail(new EmptyFSList(jcas));
@@ -294,13 +236,6 @@ public class MipacqMarkablePairGenerator
tail = (NonEmptyFSList) tail.getTail();
}
tail.setHead(labeledAntecedent);
-// if (isGoldPair(a, m)){
-// // FIXME
-// labeledAntecedent.setLabel(true);
-// break; // stop if a gold pair is found
-// }else{
-// labeledAntecedent.setLabel(false);
-// }
}
if(tail == null) pairList.setAntecedentList(new EmptyFSList(jcas));
else tail.setTail(new EmptyFSList(jcas));
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java?rev=1727990&r1=1727989&r2=1727990&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java Mon Feb 1 19:37:16 2016
@@ -18,6 +18,7 @@
*/
package org.apache.ctakes.coreference.ae;
+import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@@ -26,11 +27,6 @@ import java.util.List;
import java.util.Map;
import java.util.Scanner;
-import libsvm.svm;
-import libsvm.svm_model;
-import libsvm.svm_node;
-
-import org.apache.ctakes.core.resource.FileResource;
import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
import org.apache.ctakes.coreference.type.BooleanLabeledFS;
import org.apache.ctakes.coreference.type.DemMarkable;
@@ -49,12 +45,12 @@ import org.apache.ctakes.typesystem.type
import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
import org.apache.ctakes.typesystem.type.relation.RelationArgument;
import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
-import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.EmptyFSList;
import org.apache.uima.jcas.cas.FSList;
@@ -62,14 +58,27 @@ import org.apache.uima.jcas.cas.NonEmpty
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
+import libsvm.svm_node;
+
public class MipacqSvmChainCreator extends JCasAnnotator_ImplBase {
+ public static final String PARAM_STOPWORDS_FILENAME = "StopWords";
+ @ConfigurationParameter(name = PARAM_STOPWORDS_FILENAME, mandatory=false, defaultValue="org/apache/ctakes/coreference/models/stop.txt")
+ File stopwordFile = null;
+ HashSet<String> stopwords;
+
+ public static final String PARAM_FRAGS_FILENAME = "FragsFile";
+ @ConfigurationParameter(name = PARAM_FRAGS_FILENAME, mandatory=false, defaultValue="org/apache/ctakes/coreference/models/frags.txt")
+ File treefragFile = null;
+ private ArrayList<String> treeFrags;
+
+ public static final String PARAM_COREF_MODEL = "ModelFile";
+ @ConfigurationParameter(name = PARAM_COREF_MODEL, mandatory=false, defaultValue="org/apache/ctakes/coreference/models/ne.mayo.rbf.model")
+ File modelFile = null;
+
// LOG4J logger based on class name
private Logger logger = Logger.getLogger(getClass().getName());
- // debug
- private boolean debug = false;
-
// svm models
// private AbstractClassifier mod_pron, mod_dem, mod_coref;
private AbstractClassifier mod_coref;
@@ -80,24 +89,6 @@ public class MipacqSvmChainCreator exten
// private int coref_idx;
private SvmVectorCreator vecCreator = null;
-// ParentPtrTree ppt;
-
- HashSet<String> stopwords;
- private ArrayList<String> treeFrags;
-
-/* private svm_model loadModel (UimaContext uc, String m) {
- svm_model ret = null;
- try {
- String r = ((FileResource) uc.getResourceObject(m)).getFile().getAbsolutePath();
- ret = svm.svm_load_model(r);
- logger.info(m+" loaded: "+r);
- } catch (Exception e) {
- e.printStackTrace();
- logger.error("Error loading "+m);
- }
- return ret;
- }
-*/
@Override
public void initialize(UimaContext uc) throws ResourceInitializationException {
super.initialize(uc);
@@ -107,43 +98,35 @@ public class MipacqSvmChainCreator exten
// FIXME why is there a minus one here?
// mod_pron = new AbstractClassifier(uc, "svmPronModel", FeatureVector.getPronCorefFeatures().length + SyntaxAttributeCalculator.getNumPronFeats() - 1);
// mod_dem = new AbstractClassifier(uc, "svmDemModel", FeatureVector.getDemCorefFeatures().length + SyntaxAttributeCalculator.getNumDemFeats() - 1);
- mod_coref = new AbstractClassifier(uc, "svmCorefModel", FeatureVector.getNECorefFeatures().length + SyntaxAttributeCalculator.getNumNEFeats() - 1);
-
-// int[] labels = new int[2];
-// svm.svm_get_labels(mod_anaphoricity, labels);
-// anaphoricity_idx = labels[0]==1 ? 0 : 1;
- // svm.svm_get_labels(mod_coref, labels);
- // coref_idx = labels[0]==1 ? 0 : 1;
+ mod_coref = new AbstractClassifier(modelFile, FeatureVector.getNECorefFeatures().length + SyntaxAttributeCalculator.getNumNEFeats() - 1);
// Load stop words list
try {
- stopwords = new HashSet<String>();
- FileResource r = (FileResource) uc.getResourceObject("stopWords");
- Scanner scanner = new Scanner(r.getFile());
- String l;
- while (scanner.hasNextLine()) {
- l = scanner.nextLine().trim();
- if (l.length()==0) continue;
- int i = l.indexOf('|');
- if (i > 0)
- stopwords.add(l.substring(0,i).trim());
- else if (i < 0)
- stopwords.add(l.trim());
+ stopwords = new HashSet<>();
+ try(Scanner scanner = new Scanner(stopwordFile)){
+ String l;
+ while (scanner.hasNextLine()) {
+ l = scanner.nextLine().trim();
+ if (l.length()==0) continue;
+ int i = l.indexOf('|');
+ if (i > 0)
+ stopwords.add(l.substring(0,i).trim());
+ else if (i < 0)
+ stopwords.add(l.trim());
+ }
+ logger.info("Stop words list loaded: " + stopwordFile.getAbsolutePath());
+ vecCreator = new SvmVectorCreator(stopwords);
}
- logger.info("Stop words list loaded: " + r.getFile().getAbsolutePath());
- vecCreator = new SvmVectorCreator(stopwords);
+ treeFrags = new ArrayList<>();
+ try(Scanner scanner = new Scanner(treefragFile)){
- treeFrags = new ArrayList<String>();
- r = (FileResource) uc.getResourceObject("frags");
- if(r != null){
- scanner = new Scanner(r.getFile());
- while(scanner.hasNextLine()){
- String line = scanner.nextLine();
- treeFrags.add(line.split(" ")[1]);
- }
- vecCreator.setFrags(treeFrags);
+ while(scanner.hasNextLine()){
+ String line = scanner.nextLine();
+ treeFrags.add(line.split(" ")[1]);
+ }
+ vecCreator.setFrags(treeFrags);
+ logger.info("Tree fragment features loaded: " + treefragFile.getAbsolutePath());
}
- logger.info("Tree fragment features loaded: " + r.getFile().getAbsolutePath());
} catch (Exception e) {
e.printStackTrace();
logger.error("Error loading stop words list");
@@ -155,7 +138,7 @@ public class MipacqSvmChainCreator exten
// Convert the orderless FSIterator to List, sort by char offsets
LinkedList<Annotation> lm = FSIteratorToList.convert(
jcas.getJFSIndexRepository().getAnnotationIndex(Markable.type).iterator());
- Map<Markable, NonEmptyFSList> collectionRas = new HashMap<Markable, NonEmptyFSList>();
+ Map<Markable, NonEmptyFSList> collectionRas = new HashMap<>();
String docName = DocumentIDAnnotationUtil.getDocumentID(jcas);
logger.info("Classifying coreference in document: " + docName);
// ArrayList<CollectionTextRelation> chains = new ArrayList<CollectionTextRelation>();
@@ -166,7 +149,7 @@ public class MipacqSvmChainCreator exten
// ppt = new ParentPtrTree(lm.size());
// Make a data structure mapping markables to indexes so we don't lose the order if we re-arrange
- Map<Markable, Integer> m2q = new HashMap<Markable,Integer>();
+ Map<Markable, Integer> m2q = new HashMap<>();
for(int p = 0; p < lm.size(); p++){
m2q.put((Markable)lm.get(p), p);
@@ -176,7 +159,7 @@ public class MipacqSvmChainCreator exten
while(iter.hasNext()){
MarkablePairSet set = (MarkablePairSet) iter.next();
Markable anaphor = set.getAnaphor();
- FSList fs = (FSList) set.getAntecedentList();
+ FSList fs = set.getAntecedentList();
MarkableProb bestAnte = null;
LinkedList<Markable> ll = fs2ll(fs);
if(anaphor instanceof PronounMarkable){
@@ -243,17 +226,7 @@ public class MipacqSvmChainCreator exten
chain.setMembers(anteNode);
chain.addToIndexes();
}
- anteNode.setTail(node);
-
-
-// ppt.union(m2q.get(anaphor), m2q.get(bestAnte.m));
- if(anaphor instanceof PronounMarkable){
- // if the anaphor is a pronoun then it won't be in the cas as an identifiedannotation so we need to add it.
- IdentifiedAnnotation ia = new IdentifiedAnnotation(jcas);
- // TODO
- }
- }else{
-// indexNegativeExample(jcas, bestAnte.m, anaphor, bestAnte.prob);
+ anteNode.setTail(node);
}
}
logger.info("Done classifying document: " + docName);
@@ -300,8 +273,8 @@ public class MipacqSvmChainCreator exten
}
- private LinkedList<Markable> fs2ll(FSList fs) {
- LinkedList<Markable> ll = new LinkedList<Markable>();
+ private static LinkedList<Markable> fs2ll(FSList fs) {
+ LinkedList<Markable> ll = new LinkedList<>();
while(fs instanceof NonEmptyFSList){
NonEmptyFSList node = (NonEmptyFSList) fs;
BooleanLabeledFS feat = (BooleanLabeledFS) node.getHead();
@@ -311,7 +284,8 @@ public class MipacqSvmChainCreator exten
}
return ll;
}
-
+
+ /*
private MarkableProb processPronoun(Markable anaphor, LinkedList<Markable> anteList, JCas jcas){
Markable ante = null;
double bestProb = 0.0;
@@ -334,7 +308,7 @@ public class MipacqSvmChainCreator exten
}
return new MarkableProb(ante, bestProb);
}
-
+*/
private MarkableProb processNE(Markable anaphor, List<Markable> anteList, JCas jcas){
Markable ante = null;
double bestProb = 0.0;
@@ -353,12 +327,15 @@ public class MipacqSvmChainCreator exten
return new MarkableProb(ante, bestProb);
}
- private MarkableProb processNELazily(Markable anaphor, List<Markable> anteList, JCas jcas){
+ /*
+ private static MarkableProb processNELazily(Markable anaphor, List<Markable> anteList, JCas jcas){
if(anteList.size() > 0) return new MarkableProb(anteList.get(0), 1.0);
- else return new MarkableProb(null,0.0);
+
+ return new MarkableProb(null,0.0);
}
+ */
- private MarkableProb processDem(Markable anaphor, List<Markable> anteList, JCas jcas){
+ private static MarkableProb processDem(Markable anaphor, List<Markable> anteList, JCas jcas){
double bestProb = 0.0;
TreebankNode n = MarkableTreeUtils.markableNode(jcas, anaphor.getBegin(), anaphor.getEnd());
TreebankNode parent = (n != null ? n.getParent() : null);
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AbstractClassifier.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AbstractClassifier.java?rev=1727990&r1=1727989&r2=1727990&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AbstractClassifier.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AbstractClassifier.java Mon Feb 1 19:37:16 2016
@@ -36,19 +36,14 @@ public class AbstractClassifier {
private svm_model svmCls = null;
private int clsIndex = -1;
- public AbstractClassifier(UimaContext uc, String key, int len) {
+ public AbstractClassifier(File fn, int len) {
try{
- File file = ((FileResource)uc.getResourceObject(key)).getFile();
- String fn = file.getAbsolutePath();
- svmCls = svm.svm_load_model(fn);
+ svmCls = svm.svm_load_model(fn.getAbsolutePath());
int[] labels = new int[2];
svm.svm_get_labels(svmCls, labels);
clsIndex = labels[0]==1 ? 0 : 1;
}catch(IOException e){
e.printStackTrace();
- } catch (ResourceAccessException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
}
}
Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AnnotationSelector.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AnnotationSelector.java?rev=1727990&r1=1727989&r2=1727990&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AnnotationSelector.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/AnnotationSelector.java Mon Feb 1 19:37:16 2016
@@ -22,12 +22,7 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
//import java.util.Iterator;
-
-import org.apache.log4j.Logger;
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.cas.FSArray;
-import org.apache.uima.jcas.tcas.Annotation;
+import java.util.Set;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.Chunk;
@@ -38,6 +33,11 @@ import org.apache.ctakes.typesystem.type
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.tcas.Annotation;
// TODO: This class hardcoded all the criteria,
// which should be replaced by a parser of
@@ -83,7 +83,7 @@ public class AnnotationSelector {
}
public static ArrayList<WordToken> selectPronoun (JCas jcas,
- HashSet<String> modalAdj, HashSet<String> cogved, HashSet<String> othervb,
+ Set<String> modalAdj, Set<String> cogved, Set<String> othervb,
Logger logger) {
Hashtable<String, WordToken> offset2token = new Hashtable<String, WordToken>();
ArrayList<WordToken> ret = new ArrayList<WordToken>();
@@ -118,7 +118,7 @@ public class AnnotationSelector {
}
private static boolean isPleonastic (TerminalTreebankNode ttn,
- HashSet<String> modalAdj, HashSet<String> cogved, HashSet<String> othervb) {
+ Set<String> modalAdj, Set<String> cogved, Set<String> othervb) {
if (!ttn.getCoveredText().equalsIgnoreCase("it")) return false;
if (ttn.getNodeType().equals("PRP")) {
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/CoreferencePipelineFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/CoreferencePipelineFactory.java?rev=1727990&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/CoreferencePipelineFactory.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/CoreferencePipelineFactory.java Mon Feb 1 19:37:16 2016
@@ -0,0 +1,24 @@
+package org.apache.ctakes.coreference.util;
+
+import org.apache.ctakes.coreference.ae.MipacqMarkableCreator;
+import org.apache.ctakes.coreference.ae.MipacqMarkableExpander;
+import org.apache.ctakes.coreference.ae.MipacqMarkablePairGenerator;
+import org.apache.ctakes.coreference.ae.MipacqSvmChainCreator;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+/** Factory that assembles the four Mipacq coreference analysis engines into one uimaFIT aggregate description. */
+public class CoreferencePipelineFactory {
+ /** Builds and returns the aggregate description; ResourceInitializationException is propagated from the description-building calls below. */
+ public static AnalysisEngineDescription getCoreferencePipeline() throws ResourceInitializationException{
+ AggregateBuilder builder = new AggregateBuilder();
+ // Each engine description is created from its class alone (no configuration parameters are passed here); AggregateBuilder sequences components in the order they are added.
+ builder.add(AnalysisEngineFactory.createEngineDescription(MipacqMarkableCreator.class));
+ builder.add(AnalysisEngineFactory.createEngineDescription(MipacqMarkableExpander.class));
+ builder.add(AnalysisEngineFactory.createEngineDescription(MipacqMarkablePairGenerator.class));
+ builder.add(AnalysisEngineFactory.createEngineDescription(MipacqSvmChainCreator.class));
+
+ return builder.createAggregateDescription();
+ }
+}