You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/01/07 23:49:57 UTC
svn commit: r1430073 [1/7] - in /incubator/ctakes/trunk/ctakes-coreference: ./ .settings/ desc/ desc/analysis_engine/ desc/cas_consumer/ src/main/java/org/apache/ctakes/coreference/ae/ src/main/java/org/apache/ctakes/coreference/cc/ src/main/java/org/a...

Author: tmill
Date: Mon Jan  7 22:49:52 2013
New Revision: 1430073

URL: http://svn.apache.org/viewvc?rev=1430073&view=rev
Log:
ctakes-117: Adds features for doing mention similarity using statistics from wiki articles.
Additional features multiply other features together in meaningful ways:
(shared cui X entity type) e.g., two mentions of "liver" are probably the same because
a person only has one liver.  Two mentions of "pain" are less likely to be the same.
This feature captures that intuition by having type-specific synonym features.


Added:
    incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fdt   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fdx   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fnm   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.frq   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.nrm   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.prx   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.tii   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.tis   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.tvd   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.tvf   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.tvx   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/segments.gen   (with props)
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/segments_1   (with props)
Modified:
    incubator/ctakes/trunk/ctakes-coreference/.classpath
    incubator/ctakes/trunk/ctakes-coreference/.settings/org.eclipse.core.resources.prefs
    incubator/ctakes/trunk/ctakes-coreference/NOTICE
    incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableCreator.xml
    incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableExpander.xml
    incubator/ctakes/trunk/ctakes-coreference/desc/MipacqSvmChainCreator.xml
    incubator/ctakes/trunk/ctakes-coreference/desc/analysis_engine/ODIESvmVectorCreator.xml
    incubator/ctakes/trunk/ctakes-coreference/desc/cas_consumer/ODIEVectorFileWriterCasConsumer.xml
    incubator/ctakes/trunk/ctakes-coreference/pom.xml
    incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java
    incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/ODIEVectorFileWriter.java
    incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/FeatureVector.java
    incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/PairAttributeCalculator.java
    incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SvmVectorCreator.java
    incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SyntaxAttributeCalculator.java
    incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/ne.mayo.rbf.model

Modified: incubator/ctakes/trunk/ctakes-coreference/.classpath
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/.classpath?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/.classpath (original)
+++ incubator/ctakes/trunk/ctakes-coreference/.classpath Mon Jan  7 22:49:52 2013
@@ -7,18 +7,18 @@
 		</attributes>
 	</classpathentry>
 	<classpathentry kind="con" path="org.eclipse.jdt.USER_LIBRARY/UIMA"/>
-	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
+	<classpathentry kind="src" output="target/classes" path="target/generated-sources/jcasgen">
 		<attributes>
+			<attribute name="optional" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
-	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
+	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
 		<attributes>
-			<attribute name="optional" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
-	<classpathentry kind="src" output="target/classes" path="target/generated-sources/jcasgen">
+	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
 		<attributes>
 			<attribute name="optional" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>

Modified: incubator/ctakes/trunk/ctakes-coreference/.settings/org.eclipse.core.resources.prefs
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/.settings/org.eclipse.core.resources.prefs?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/.settings/org.eclipse.core.resources.prefs (original)
+++ incubator/ctakes/trunk/ctakes-coreference/.settings/org.eclipse.core.resources.prefs Mon Jan  7 22:49:52 2013
@@ -1,5 +1,6 @@
 eclipse.preferences.version=1
 encoding//src/main/java=UTF-8
 encoding//src/main/resources=UTF-8
+encoding//src/test/java=UTF-8
 encoding//target/generated-sources/jcasgen=UTF-8
 encoding/<project>=UTF-8

Modified: incubator/ctakes/trunk/ctakes-coreference/NOTICE
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/NOTICE?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/NOTICE (original)
+++ incubator/ctakes/trunk/ctakes-coreference/NOTICE Mon Jan  7 22:49:52 2013
@@ -1,5 +1,5 @@
 ==============================================================================================
-					Copyright to Children's Hostpital Boston
+					Copyright to Children's Hospital Boston
 ==============================================================================================
 
 This product includes software (OpenAI_FSM.jar) developed by OpenAi Labs.

Modified: incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableCreator.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableCreator.xml?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableCreator.xml (original)
+++ incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableCreator.xml Mon Jan  7 22:49:52 2013
@@ -101,7 +101,7 @@
     </configurationParameterSettings>
     <typeSystemDescription>
       <imports>
-        <import location="type-system/CorefTypes.xml"/>
+        <import name="org.apache.ctakes.coreference.types.TypeSystem"/>
         <import name="org.apache.ctakes.typesystem.types.TypeSystem"/>
       </imports>
     </typeSystemDescription>

Modified: incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableExpander.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableExpander.xml?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableExpander.xml (original)
+++ incubator/ctakes/trunk/ctakes-coreference/desc/MipacqMarkableExpander.xml Mon Jan  7 22:49:52 2013
@@ -33,7 +33,7 @@
     <typeSystemDescription>
       <imports>
         <import name="org.apache.ctakes.typesystem.types.TypeSystem"/>
-        <import location="type-system/CorefTypes.xml"/>
+        <import name="org.apache.ctakes.coreference.types.TypeSystem"/>
       </imports>
     </typeSystemDescription>
     <typePriorities/>

Modified: incubator/ctakes/trunk/ctakes-coreference/desc/MipacqSvmChainCreator.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/desc/MipacqSvmChainCreator.xml?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/desc/MipacqSvmChainCreator.xml (original)
+++ incubator/ctakes/trunk/ctakes-coreference/desc/MipacqSvmChainCreator.xml Mon Jan  7 22:49:52 2013
@@ -32,7 +32,7 @@
     <configurationParameterSettings/>
     <typeSystemDescription>
       <imports>
-        <import location="type-system/CorefTypes.xml"/>
+        <import name="org.apache.ctakes.coreference.types.TypeSystem"/>
       </imports>
     </typeSystemDescription>
     <typePriorities/>

Modified: incubator/ctakes/trunk/ctakes-coreference/desc/analysis_engine/ODIESvmVectorCreator.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/desc/analysis_engine/ODIESvmVectorCreator.xml?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/desc/analysis_engine/ODIESvmVectorCreator.xml (original)
+++ incubator/ctakes/trunk/ctakes-coreference/desc/analysis_engine/ODIESvmVectorCreator.xml Mon Jan  7 22:49:52 2013
@@ -24,7 +24,7 @@
   <primitive>false</primitive>
   <delegateAnalysisEngineSpecifiers>
     <delegateAnalysisEngine key="ConstituencyParserAnnotator">
-      <import location="../../../Constituency Parser/desc/ConstituencyParserAnnotator.xml"/>
+      <import location="../../../ctakes-constituency-parser/desc/ConstituencyParserAnnotator.xml"/>
     </delegateAnalysisEngine>
     <delegateAnalysisEngine key="TokenizerAnnotator">
       <import location="../../../ctakes-core/desc/analysis_engine/TokenizerAnnotator.xml"/>
@@ -47,9 +47,6 @@
     <delegateAnalysisEngine key="SentenceDetectorAnnotator">
       <import location="../../../ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml"/>
     </delegateAnalysisEngine>
-    <delegateAnalysisEngine key="ODIEVectorFileWriterCasConsumer">
-      <import location="../cas_consumer/ODIEVectorFileWriterCasConsumer.xml"/>
-    </delegateAnalysisEngine>
     <delegateAnalysisEngine key="SimpleSegmentAnnotator">
       <import location="../../../ctakes-clinical-pipeline/desc/analysis_engine/SimpleSegmentAnnotator.xml"/>
     </delegateAnalysisEngine>
@@ -97,74 +94,12 @@
           <parameter>Chunker/ChunkCreatorClass</parameter>
         </overrides>
       </configurationParameter>
-      <configurationParameter>
-        <name>outputDir</name>
-        <description>Where the files will be written to</description>
-        <type>String</type>
-        <multiValued>false</multiValued>
-        <mandatory>true</mandatory>
-        <overrides>
-          <parameter>ODIEVectorFileWriterCasConsumer/outputDir</parameter>
-        </overrides>
-      </configurationParameter>
-      <configurationParameter>
-        <name>goldStandardDir</name>
-        <type>String</type>
-        <multiValued>false</multiValued>
-        <mandatory>true</mandatory>
-        <overrides>
-          <parameter>ODIEVectorFileWriterCasConsumer/goldStandardDir</parameter>
-        </overrides>
-      </configurationParameter>
-      <configurationParameter>
-        <name>writeVectors</name>
-        <description>Should the consumer print out the vector pairs for the coreferent markables?</description>
-        <type>Boolean</type>
-        <multiValued>false</multiValued>
-        <mandatory>true</mandatory>
-        <overrides>
-          <parameter>ODIEVectorFileWriterCasConsumer/writeVectors</parameter>
-        </overrides>
-      </configurationParameter>
-      <configurationParameter>
-        <name>writeTrees</name>
-        <type>Boolean</type>
-        <multiValued>false</multiValued>
-        <mandatory>true</mandatory>
-        <overrides>
-          <parameter>ODIEVectorFileWriterCasConsumer/writeTrees</parameter>
-        </overrides>
-      </configurationParameter>
     </configurationParameters>
     <configurationParameterSettings>
       <nameValuePair>
         <name>ChunkCreatorClass</name>
         <value>
-          <string>edu.mayo.bmi.uima.chunker.PhraseTypeChunkCreator</string>
-        </value>
-      </nameValuePair>
-      <nameValuePair>
-        <name>outputDir</name>
-        <value>
-          <string>change me</string>
-        </value>
-      </nameValuePair>
-      <nameValuePair>
-        <name>goldStandardDir</name>
-        <value>
-          <string>chang me</string>
-        </value>
-      </nameValuePair>
-      <nameValuePair>
-        <name>writeVectors</name>
-        <value>
-          <boolean>true</boolean>
-        </value>
-      </nameValuePair>
-      <nameValuePair>
-        <name>writeTrees</name>
-        <value>
-          <boolean>false</boolean>
+          <string>org.apache.ctakes.chunker.ae.PhraseTypeChunkCreator</string>
         </value>
       </nameValuePair>
     </configurationParameterSettings>
@@ -185,7 +120,6 @@
         <node>MipacqMarkableCreator</node>
         <node>MipacqMarkableExpander</node>
         <node>MipacqMarkablePairGenerator</node>
-        <node>ODIEVectorFileWriterCasConsumer</node>
       </fixedFlow>
     </flowConstraints>
     <typePriorities>

Modified: incubator/ctakes/trunk/ctakes-coreference/desc/cas_consumer/ODIEVectorFileWriterCasConsumer.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/desc/cas_consumer/ODIEVectorFileWriterCasConsumer.xml?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/desc/cas_consumer/ODIEVectorFileWriterCasConsumer.xml (original)
+++ incubator/ctakes/trunk/ctakes-coreference/desc/cas_consumer/ODIEVectorFileWriterCasConsumer.xml Mon Jan  7 22:49:52 2013
@@ -139,7 +139,7 @@
         <name>stopWordsFile</name>
         <description/>
         <fileResourceSpecifier>
-          <fileUrl>file:stop.txt</fileUrl>
+          <fileUrl>file:org/apache/ctakes/coreference/models/stop.txt</fileUrl>
         </fileResourceSpecifier>
         <implementationName>org.apache.ctakes.core.resource.FileResourceImpl</implementationName>
       </externalResource>
@@ -147,7 +147,7 @@
         <name>treeFragFile</name>
         <description/>
         <fileResourceSpecifier>
-          <fileUrl>file:frags.txt</fileUrl>
+          <fileUrl>file:org/apache/ctakes/coreference/models/frags.txt</fileUrl>
         </fileResourceSpecifier>
         <implementationName>org.apache.ctakes.core.resource.FileResourceImpl</implementationName>
       </externalResource>

Modified: incubator/ctakes/trunk/ctakes-coreference/pom.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/pom.xml?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/pom.xml (original)
+++ incubator/ctakes/trunk/ctakes-coreference/pom.xml Mon Jan  7 22:49:52 2013
@@ -62,6 +62,31 @@
 			<groupId>junit</groupId>
 			<artifactId>junit</artifactId>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.ctakes</groupId>
+			<artifactId>ctakes-relation-extractor</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.ctakes</groupId>
+			<artifactId>ctakes-dictionary-lookup</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>net.sourceforge.ctakesresources</groupId>
+			<artifactId>ctakes-resources</artifactId>
+			<version>3.1.0</version>
+			<type>pom</type>
+		</dependency>
+		<dependency>
+			<groupId>net.sourceforge.ctakesresources</groupId>
+			<artifactId>ctakes-resources-umls2011ab</artifactId>
+			<version>3.1.0</version>
+		</dependency>
+		<dependency>
+			<groupId>net.sourceforge.ctakesresources</groupId>
+			<artifactId>ctakes-resources-distribution</artifactId>
+			<version>3.1.0</version>
+			<type>pom</type>
+		</dependency>
 	</dependencies>
 	
 		<build>

Modified: incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java (original)
+++ incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/MipacqSvmChainCreator.java Mon Jan  7 22:49:52 2013
@@ -18,8 +18,6 @@
  */
 package org.apache.ctakes.coreference.ae;
 
-import java.io.BufferedReader;
-import java.io.FileReader;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -32,38 +30,37 @@ import libsvm.svm;
 import libsvm.svm_model;
 import libsvm.svm_node;
 
-import org.apache.log4j.Logger;
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.cas.EmptyFSList;
-import org.apache.uima.jcas.cas.FSList;
-import org.apache.uima.jcas.cas.NonEmptyFSList;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.ctakes.coreference.type.BooleanLabeledFS;
-
-
 import org.apache.ctakes.core.resource.FileResource;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.coreference.type.BooleanLabeledFS;
+import org.apache.ctakes.coreference.type.DemMarkable;
+import org.apache.ctakes.coreference.type.Markable;
+import org.apache.ctakes.coreference.type.MarkablePairSet;
+import org.apache.ctakes.coreference.type.NEMarkable;
+import org.apache.ctakes.coreference.type.PronounMarkable;
 import org.apache.ctakes.coreference.util.AbstractClassifier;
 import org.apache.ctakes.coreference.util.CorefConsts;
 import org.apache.ctakes.coreference.util.FSIteratorToList;
 import org.apache.ctakes.coreference.util.FeatureVector;
 import org.apache.ctakes.coreference.util.MarkableTreeUtils;
-import org.apache.ctakes.coreference.util.ParentPtrTree;
+import org.apache.ctakes.coreference.util.SvmVectorCreator;
 import org.apache.ctakes.coreference.util.SyntaxAttributeCalculator;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
 import org.apache.ctakes.typesystem.type.relation.RelationArgument;
 import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
-import org.apache.ctakes.coreference.type.DemMarkable;
-import org.apache.ctakes.coreference.type.Markable;
-import org.apache.ctakes.coreference.type.MarkablePairSet;
-import org.apache.ctakes.coreference.type.NEMarkable;
-import org.apache.ctakes.coreference.type.PronounMarkable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
 
 public class MipacqSvmChainCreator extends JCasAnnotator_ImplBase {
 
@@ -74,18 +71,21 @@ public class MipacqSvmChainCreator exten
 	private boolean debug = false;
 
 	// svm models
-	private AbstractClassifier mod_pron, mod_dem, mod_coref;
-	private svm_model mod_anaphoricity;
+//	private AbstractClassifier mod_pron, mod_dem, mod_coref;
+	private AbstractClassifier mod_coref;
+//	private AbstractClassifier mod_pron;
+//	private svm_model mod_anaphoricity;
 	// positive class label index in libsvm's prob_est array
-	private int anaphoricity_idx, coref_idx;
-	private org.apache.ctakes.coreference.util.SvmVectorCreator vecCreator = null;
+//	private int anaphoricity_idx
+//	private int coref_idx;
+	private SvmVectorCreator vecCreator = null;
 	
-	ParentPtrTree ppt;
+//	ParentPtrTree ppt;
 
 	HashSet<String> stopwords;
 	private ArrayList<String> treeFrags;
 
-	private svm_model loadModel (UimaContext uc, String m) {
+/*	private svm_model loadModel (UimaContext uc, String m) {
 		svm_model ret = null;
 		try {
 			String r = ((FileResource) uc.getResourceObject(m)).getFile().getAbsolutePath();
@@ -97,21 +97,21 @@ public class MipacqSvmChainCreator exten
 		}
 		return ret;
 	}
-
+*/
 	@Override
 	public void initialize(UimaContext uc) throws ResourceInitializationException {
 		super.initialize(uc);
 
 		// Load svm models
-		mod_anaphoricity = loadModel(uc, "svmAnaphoricityModel");
+//		mod_anaphoricity = loadModel(uc, "svmAnaphoricityModel");
 		// FIXME why is there a minus one here?
-		mod_pron = new AbstractClassifier(uc, "svmPronModel", FeatureVector.getPronCorefFeatures().length + SyntaxAttributeCalculator.getNumPronFeats() - 1);
+//		mod_pron = new AbstractClassifier(uc, "svmPronModel", FeatureVector.getPronCorefFeatures().length + SyntaxAttributeCalculator.getNumPronFeats() - 1);
 //		mod_dem = new AbstractClassifier(uc, "svmDemModel", FeatureVector.getDemCorefFeatures().length + SyntaxAttributeCalculator.getNumDemFeats() - 1);
 		mod_coref = new AbstractClassifier(uc, "svmCorefModel", FeatureVector.getNECorefFeatures().length + SyntaxAttributeCalculator.getNumNEFeats() - 1);
 
-		int[] labels = new int[2];
-		svm.svm_get_labels(mod_anaphoricity, labels);
-		anaphoricity_idx = labels[0]==1 ? 0 : 1;
+//		int[] labels = new int[2];
+//		svm.svm_get_labels(mod_anaphoricity, labels);
+//		anaphoricity_idx = labels[0]==1 ? 0 : 1;
 		//		svm.svm_get_labels(mod_coref, labels);
 		//		coref_idx = labels[0]==1 ? 0 : 1;
 
@@ -130,7 +130,8 @@ public class MipacqSvmChainCreator exten
 				else if (i < 0)
 					stopwords.add(l.trim());
 			}
-			vecCreator = new org.apache.ctakes.coreference.util.SvmVectorCreator(stopwords, mod_anaphoricity);
+			logger.info("Stop words list loaded: " + r.getFile().getAbsolutePath());
+			vecCreator = new SvmVectorCreator(stopwords);
 
 			treeFrags = new ArrayList<String>();
 			r = (FileResource) uc.getResourceObject("frags");
@@ -142,7 +143,7 @@ public class MipacqSvmChainCreator exten
 				}
 				vecCreator.setFrags(treeFrags);
 			}
-			logger.info("Stop words list loaded: " + r.getFile().getAbsolutePath());
+			logger.info("Tree fragment features loaded: " + r.getFile().getAbsolutePath());
 		} catch (Exception e) {
 			e.printStackTrace();
 			logger.error("Error loading stop words list");
@@ -154,16 +155,23 @@ public class MipacqSvmChainCreator exten
 		// Convert the orderless FSIterator to List, sort by char offsets
 		LinkedList<Annotation> lm = FSIteratorToList.convert(
 				jcas.getJFSIndexRepository().getAnnotationIndex(Markable.type).iterator());
-
+		Map<Markable, NonEmptyFSList> collectionRas = new HashMap<Markable, NonEmptyFSList>();
+		String docName = DocumentIDAnnotationUtil.getDocumentID(jcas);
+		logger.info("Classifying coreference in document: " + docName);
+//		ArrayList<CollectionTextRelation> chains = new ArrayList<CollectionTextRelation>();
+		int chainId = 0;
+		
+		EmptyFSList emptyList = new EmptyFSList(jcas);
 		// Create a parent pointer tree to calculate equivalence classes
-		ppt = new ParentPtrTree(lm.size());
+//		ppt = new ParentPtrTree(lm.size());
 
 		// Make a data structure mapping markables to indexes so we don't lose the order if we re-arrange
 		Map<Markable, Integer> m2q = new HashMap<Markable,Integer>();
+		
 		for(int p = 0; p < lm.size(); p++){
 			m2q.put((Markable)lm.get(p), p);
 		}
-		
+			
 		FSIterator<Annotation> iter = jcas.getAnnotationIndex(MarkablePairSet.type).iterator();
 		while(iter.hasNext()){
 			MarkablePairSet set = (MarkablePairSet) iter.next();
@@ -172,15 +180,23 @@ public class MipacqSvmChainCreator exten
 			MarkableProb bestAnte = null;
 			LinkedList<Markable> ll = fs2ll(fs);
 			if(anaphor instanceof PronounMarkable){
-				bestAnte = processPronoun(anaphor, ll, jcas);
+				// There is not enough training data to do this reliably... the
+				// classifier for this type will decrease scores
+//				bestAnte = processPronoun(anaphor, ll, jcas);
+				bestAnte = new MarkableProb(null, 0.0);
 			}else if(anaphor instanceof NEMarkable){
 				bestAnte = processNE(anaphor, ll, jcas);
 			}else if(anaphor instanceof DemMarkable){
 				bestAnte = processDem(anaphor, ll, jcas);
+			}else{
+				// should not happen...
+				continue;
 			}
 
 			if(bestAnte.prob > CorefConsts.COREF_THRESHOLD){
+				// create the coref relation type
 				CoreferenceRelation cr = new CoreferenceRelation(jcas);
+				cr.setCategory("Coreference");
 				RelationArgument ra1 = new RelationArgument(jcas);
 				ra1.setId(bestAnte.m.getId());
 				ra1.setArgument(bestAnte.m.getContent());
@@ -195,42 +211,92 @@ public class MipacqSvmChainCreator exten
 				ra1.addToIndexes();
 				ra2.addToIndexes();
 				cr.addToIndexes();
-				ppt.union(m2q.get(anaphor), m2q.get(bestAnte.m));
+				
+				// propagate the collection relation type
+				RelationArgument anaRa = new RelationArgument(jcas);
+				anaRa.setId(anaphor.getId());
+				anaRa.setArgument(anaphor.getContent());
+				anaRa.setRole("mention");
+				NonEmptyFSList node = new NonEmptyFSList(jcas);
+				node.setHead(anaRa);
+				node.setTail(emptyList);
+				collectionRas.put(anaphor, node);
+				NonEmptyFSList anteNode = null;
+				if(collectionRas.containsKey(bestAnte.m)){
+					anteNode = collectionRas.get(bestAnte.m);
+					// find the end of the chain of this node
+					while(anteNode.getTail() instanceof NonEmptyFSList){
+						anteNode = (NonEmptyFSList) anteNode.getTail();
+					}
+				}else{
+					RelationArgument anteRa = new RelationArgument(jcas);
+					anteRa.setId(bestAnte.m.getId());
+					anteRa.setArgument(bestAnte.m.getContent());
+					anteRa.setRole("mention");
+					
+					anteNode = new NonEmptyFSList(jcas);
+					anteNode.setHead(anteRa);
+					collectionRas.put(bestAnte.m, anteNode);
+					CollectionTextRelation chain = new CollectionTextRelation(jcas);
+					chain.setId(chainId++);
+					chain.setCategory("CoreferenceChain");
+					chain.setMembers(anteNode);
+					chain.addToIndexes();
+				}
+				anteNode.setTail(node);
+				
+				
+//				ppt.union(m2q.get(anaphor), m2q.get(bestAnte.m));
 				if(anaphor instanceof PronounMarkable){
 					// if the anaphor is a pronoun then it won't be in the cas as an identifiedannotation so we need to add it.
 					IdentifiedAnnotation ia = new IdentifiedAnnotation(jcas);
-					
+					// TODO
 				}
 			}else{
 //				indexNegativeExample(jcas, bestAnte.m, anaphor, bestAnte.prob);
 			}
 		}
+		logger.info("Done classifying document: " + docName);
 
-		// Extract equivalence classes and save them into CAS
-		int[] ec = new int[ppt.getSize()]; // class number for each Markable
-		int n = ppt.equivCls(ec); // n holds the number of classes
-		EmptyFSList elist = new EmptyFSList(jcas); // shared tail for all chains
-		FSList[] listhds = new FSList[n]; // keep track of the heads of all chains
-		CollectionTextRelation[] chains = new CollectionTextRelation[n];
-
-		// Initialize n chains
-		for (int i = 0; i < n; ++i) {
-			chains[i] = new CollectionTextRelation(jcas);
-			chains[i].setId(i);
-			chains[i].setCategory("CoreferenceChain");
-			chains[i].addToIndexes();
-			listhds[i] = elist;
-		}
-
-		// Scan from the end of the Markable list
-		// insert Markables to the head of their chains
-		for (int i = ec.length-1; i >= 0; --i) {
-			NonEmptyFSList l = new NonEmptyFSList(jcas);
-			l.setHead(lm.get(i));
-			l.setTail(listhds[ec[i]]);
-			listhds[ec[i]] = l;
-			chains[ec[i]].setMembers(l);
-		}
+//		// Extract equivalence classes and save them into CAS
+//		int[] ec = new int[ppt.getSize()]; // class number for each Markable
+//		int n = ppt.equivCls(ec); // n holds the number of classes
+//		EmptyFSList elist = new EmptyFSList(jcas); // shared tail for all chains
+//		FSList[] listhds = new FSList[n]; // keep track of the heads of all chains
+//		CollectionTextRelation[] chains = new CollectionTextRelation[n];
+
+//		// Initialize n chains
+//		for (int i = 0; i < n; ++i) {
+//			chains[i] = null; //new CollectionTextRelation(jcas);
+////			chains[i].setId(i);
+////			chains[i].setCategory("CoreferenceChain");
+////			chains[i].addToIndexes();
+//			listhds[i] = elist;
+//		}
+
+//		// Scan from the end of the Markable list
+//		// insert Markables to the head of their chains
+//		for (int i = ec.length-1; i >= 0; --i) {
+//			if(m2ra.containsKey(lm.get(i))){
+//				NonEmptyFSList l = new NonEmptyFSList(jcas);
+//				l.setHead(m2ra.get(lm.get(i)));
+//				l.setTail(listhds[ec[i]]);
+//				listhds[ec[i]] = l;
+//				if(chains[ec[i]] == null){
+//					chains[ec[i]] = new CollectionTextRelation(jcas);
+//				}
+//				chains[ec[i]].setMembers(l);
+//			}
+//		}
+		
+//		int j = 0;
+//		for(int i = 0; i < n; i++){
+//			if(chains[i] != null){
+//				chains[i].setId(j++);
+//				chains[i].setCategory("CoreferenceChain");
+//				chains[i].addToIndexes();
+//			}
+//		}
 	}
 
 

Modified: incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/ODIEVectorFileWriter.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/ODIEVectorFileWriter.java?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/ODIEVectorFileWriter.java (original)
+++ incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/ODIEVectorFileWriter.java Mon Jan  7 22:49:52 2013
@@ -22,69 +22,56 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileReader;
-import java.io.IOException;
 import java.io.PrintWriter;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Hashtable;
 import java.util.LinkedList;
 import java.util.Scanner;
-import java.util.Vector;
-
-import org.apache.log4j.Logger;
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.CASException;
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.collection.CasConsumer_ImplBase;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.cas.FSList;
-import org.apache.uima.jcas.cas.NonEmptyFSList;
-import org.apache.uima.jcas.cas.EmptyFSList;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.ResourceProcessException;
-import org.apache.uima.util.ProcessTrace;
-import org.apache.ctakes.coreference.type.BooleanLabeledFS;
 
+import libsvm.svm_node;
 
 import org.apache.ctakes.constituency.parser.treekernel.TreeExtractor;
 import org.apache.ctakes.constituency.parser.util.TreeUtils;
 import org.apache.ctakes.core.resource.FileLocator;
-import org.apache.ctakes.core.resource.FileResource;
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
-import org.apache.ctakes.coreference.eval.helpers.Span;
-import org.apache.ctakes.coreference.eval.helpers.SpanAlignment;
-import org.apache.ctakes.coreference.eval.helpers.SpanOffsetComparator;
+import org.apache.ctakes.coreference.type.BooleanLabeledFS;
+import org.apache.ctakes.coreference.type.DemMarkable;
+import org.apache.ctakes.coreference.type.Markable;
+import org.apache.ctakes.coreference.type.MarkablePairSet;
+import org.apache.ctakes.coreference.type.NEMarkable;
 import org.apache.ctakes.coreference.util.CorefConsts;
 import org.apache.ctakes.coreference.util.FSIteratorToList;
 import org.apache.ctakes.coreference.util.GoldStandardLabeler;
 import org.apache.ctakes.coreference.util.MarkableTreeUtils;
 import org.apache.ctakes.coreference.util.PairAttributeCalculator;
-import org.apache.ctakes.coreference.util.ParentPtrTree;
-import org.apache.ctakes.coreference.util.SvmUtils;
 import org.apache.ctakes.coreference.util.SvmVectorCreator;
+import org.apache.ctakes.relationextractor.eval.XMIReader;
 import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
 import org.apache.ctakes.utils.tree.SimpleTree;
-import org.apache.ctakes.coreference.type.MarkablePairSet;
-import org.apache.ctakes.coreference.type.Markable;
-import org.apache.ctakes.coreference.type.DemMarkable;
-import org.apache.ctakes.coreference.type.NEMarkable;
-import org.apache.ctakes.coreference.type.PronounMarkable;
-
-import libsvm.svm;
-import libsvm.svm_model;
-import libsvm.svm_node;
-import libsvm.svm_parameter;
-import libsvm.svm_problem;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.SimplePipeline;
 
-public class ODIEVectorFileWriter extends CasConsumer_ImplBase {
+public class ODIEVectorFileWriter extends JCasAnnotator_ImplBase {
 
 	private Logger log = Logger.getLogger(this.getClass());
-	private static final Integer NGRAM_THRESHOLD = 0;
+//	private static final Integer NGRAM_THRESHOLD = 0;
 	private String outputDir = null;
 	private String goldStandardDir = null;
-	private PrintWriter anaphOut = null;
+//	private PrintWriter anaphOut = null;
 	private PrintWriter neOut = null;
 	private PrintWriter pronOut = null;
 	private PrintWriter demOut = null;
@@ -102,8 +89,8 @@ public class ODIEVectorFileWriter extend
 	private int posAnaphInst = 0;
 	private int negAnaphInst = 0;
 	//	private svm_problem anaphProb = null;
-	private ArrayList<Integer> anaphLabels = new ArrayList<Integer>();
-	private ArrayList<svm_node[]> anaphNodes = new ArrayList<svm_node[]>();
+//	private ArrayList<Integer> anaphLabels = new ArrayList<Integer>();
+//	private ArrayList<svm_node[]> anaphNodes = new ArrayList<svm_node[]>();
 //	private ArrayList<Integer> corefLabels = new ArrayList<Integer>();
 //	private ArrayList<svm_node[]> corefNodes = new ArrayList<svm_node[]>();
 	//	private ArrayList<TopTreebankNode> corefPathTrees = new ArrayList<TopTreebankNode>();
@@ -126,18 +113,26 @@ public class ODIEVectorFileWriter extend
 	//	private boolean printModels;
 	private boolean printVectors;
 	private boolean printTrees;
-	private boolean anaphora;
+//	private boolean anaphora;
 	private boolean useFrags = true; 							// make a parameter once development is done...
 
+	public static final String PARAM_OUTPUT_DIR = "outputDir";
+	public static final String PARAM_GOLD_DIR = "goldStandardDir";
+	public static final String PARAM_VECTORS = "writeVectors";
+	public static final String PARAM_TREES = "writeTrees";
+//	public static final String PARAM_ANAPH = "anaphora";
+	public static final String PARAM_FRAGS = "treeFrags";
+	public static final String PARAM_STOPS = "stopWords";
+	
 	@Override
-	public void initialize() throws ResourceInitializationException{
-		outputDir = (String) getConfigParameterValue("outputDir");
-		goldStandardDir = (String) getConfigParameterValue("goldStandardDir");
+	public void initialize(UimaContext aContext){
+		outputDir = (String) aContext.getConfigParameterValue(PARAM_OUTPUT_DIR);
+		goldStandardDir = (String) aContext.getConfigParameterValue(PARAM_GOLD_DIR);
 		//		printModels = (Boolean) getConfigParameterValue("writeModels");
-		printVectors = (Boolean) getConfigParameterValue("writeVectors");
-		printTrees = (Boolean) getConfigParameterValue("writeTrees");
+		printVectors = (Boolean) aContext.getConfigParameterValue(PARAM_VECTORS);
+		printTrees = (Boolean) aContext.getConfigParameterValue(PARAM_TREES);
 //		upSample = (Boolean) getConfigParameterValue("upSample");
-		anaphora = (Boolean) getConfigParameterValue("anaphora");
+//		anaphora = (Boolean) aContext.getConfigParameterValue(PARAM_ANAPH);
 
 		try{
 			// need to initialize parameters to default values (except where noted)
@@ -147,12 +142,12 @@ public class ODIEVectorFileWriter extend
 			proDir.mkdirs();
 			File demDir = new File(outputDir + "/" + CorefConsts.DEM + "/vectors/");
 			demDir.mkdirs();
-			if(printVectors){
-				if(anaphora) anaphOut = new PrintWriter(outputDir + "/anaphor.trainingvectors.libsvm");
+//			if(printVectors){
+//				if(anaphora) anaphOut = new PrintWriter(outputDir + "/anaphor.trainingvectors.libsvm");
 //				neOut = new PrintWriter(outputDir + "/" + CorefConsts.NE + "/training.libsvm");
 //				demOut = new PrintWriter(outputDir + "/" + CorefConsts.DEM + "/training.libsvm");
 //				pronOut = new PrintWriter(outputDir + "/" + CorefConsts.PRON + "/training.libsvm");
-			}
+//			}
 			if(printTrees){
 				neTreeOut = new PrintWriter(outputDir + "/" + CorefConsts.NE + "/trees.txt");
 				demTreeOut = new PrintWriter(outputDir + "/" + CorefConsts.DEM + "/trees.txt");
@@ -163,8 +158,9 @@ public class ODIEVectorFileWriter extend
 			//				pathTreeOut = new PrintWriter(outputDir + "/" + CorefConsts.NE + "/matrix.out");
 			//			}
 			stopwords = new HashSet<String>();
-			FileResource r = (FileResource) super.getUimaContext().getResourceObject("stopWords");
-			BufferedReader br = new BufferedReader(new FileReader(r.getFile()));
+//			FileResource r = (FileResource) aContext.getResourceObject("stopWords");
+			File stopFile = FileLocator.locateFile(((String)aContext.getConfigParameterValue(PARAM_STOPS)));
+			BufferedReader br = new BufferedReader(new FileReader(stopFile));
 			String l;
 			while ((l = br.readLine())!=null) {
 				l = l.trim();
@@ -175,11 +171,12 @@ public class ODIEVectorFileWriter extend
 				else if (i < 0)
 					stopwords.add(l.trim());
 			}
-			File anaphModFile = FileLocator.locateFile("anaphoricity.mayo.rbf.model");
-			svm_model anaphModel = svm.svm_load_model(anaphModFile.getAbsolutePath());
-			vecCreator = new SvmVectorCreator(stopwords, anaphModel);
-			r = (FileResource) super.getUimaContext().getResourceObject("treeFrags");
-			Scanner scanner = new Scanner(r.getFile());
+//			File anaphModFile = FileLocator.locateFile("anaphoricity.mayo.rbf.model");
+//			svm_model anaphModel = svm.svm_load_model(anaphModFile.getAbsolutePath());
+			vecCreator = new SvmVectorCreator(stopwords);
+//			r = (FileResource) aContext.getResourceObject("treeFrags");
+			File fragFile = FileLocator.locateFile(((String)aContext.getConfigParameterValue(PARAM_FRAGS)));
+			Scanner scanner = new Scanner(fragFile);
 			if(useFrags){
 				treeFrags = new ArrayList<String>();
 				while(scanner.hasNextLine()){
@@ -191,22 +188,21 @@ public class ODIEVectorFileWriter extend
 			initialized = true;
 		}catch(Exception e){
 			System.err.println("Error initializing file writers.");
-			throw new ResourceInitializationException();
 		}
 	}
 
 	@Override
-	public void processCas(CAS arg0) throws ResourceProcessException {
+	public void process(JCas jcas) {
 		//		System.err.println("processCas-ing");
 		if(!initialized) return;
-		JCas jcas;
-		try {
-			jcas = arg0.getCurrentView().getJCas();
-		} catch (CASException e) {
-			e.printStackTrace();
-			System.err.println("No processing done in ODIEVectoFileWriter!");
-			return;
-		}
+//		JCas jcas;
+//		try {
+//			jcas = arg0.getCurrentView().getJCas();
+//		} catch (CASException e) {
+//			e.printStackTrace();
+//			System.err.println("No processing done in ODIEVectoFileWriter!");
+//			return;
+//		}
 
 		String docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
 		docId = docId.substring(docId.lastIndexOf('/')+1, docId.length());
@@ -214,7 +210,7 @@ public class ODIEVectorFileWriter extend
 //		Hashtable<Integer, Integer> goldId2AlignId = new Hashtable<Integer, Integer>();
 //		Hashtable<Integer, Integer> alignId2GoldId = new Hashtable<Integer, Integer>();
 		if (docId==null) docId = "141471681_1";
-		System.out.print("creating vectors for "+docId);
+		System.out.println("creating vectors for "+docId);
 //		Vector<Span> goldSpans = loadGoldStandard(docId, goldSpan2id);
 		int numPos = 0;
 
@@ -277,13 +273,13 @@ public class ODIEVectorFileWriter extend
 				NonEmptyFSList node = (NonEmptyFSList) pairList;
 				BooleanLabeledFS labeledProb = (BooleanLabeledFS) node.getHead();
 				int label = labeledProb.getLabel() ? 1 : 0;
-				if(anaphora){
-					if(label == 1) posAnaphInst++;
-					else negAnaphInst++;
-					anaphLabels.add(label);
-					svm_node[] nodes = vecCreator.createAnaphoricityVector(anaphor, jcas);
-					anaphNodes.add(nodes);
-				}
+//				if(anaphora){
+//					if(label == 1) posAnaphInst++;
+//					else negAnaphInst++;
+//					anaphLabels.add(label);
+//					svm_node[] nodes = vecCreator.createAnaphoricityVector(anaphor, jcas);
+//					anaphNodes.add(nodes);
+//				}
 				Markable antecedent = (Markable) labeledProb.getFeature();
 				label = (labeler.isGoldPair(anaphor, antecedent) ? 1 : 0);
 				if(label == 1){
@@ -361,7 +357,11 @@ public class ODIEVectorFileWriter extend
 					writer.println(" |ET|");
 				}
 				pairList = node.getTail();
-				if(label == 1) break;
+				// NOTE: If this is in place, then we will only output negative examples backwards until we reach
+				// the actual coreferent entity.  This may have the effect of suggesting that further away markables
+				// are _more_ likely to be coreferent, which is an assumption that probably does not hold up in the
+				// test set configuration.  Try commenting this feature out to see if it makes the feature more useful.
+//				if(label == 1) break;
 			}
 		}
 		if(printVectors){
@@ -378,38 +378,39 @@ public class ODIEVectorFileWriter extend
 		return Integer.parseInt(nodeStr.substring(0,1));
 	}
 
+	
 	@Override
-	public void collectionProcessComplete(ProcessTrace arg0)
-	throws ResourceProcessException, IOException {
-		super.collectionProcessComplete(arg0);
+	public void batchProcessComplete() throws AnalysisEngineProcessException {
+		super.batchProcessComplete();
+
 		//		System.err.println("collectionProcessComplete!");
 		if(!initialized) return;
 
 //		int numPos = 1;
 //		int numNeg = 1;
-
-		if(anaphora){
-			double anaphRatio = (double) posAnaphInst / (double) negAnaphInst;
-//			if(anaphRatio > 1.0) numNeg = (int) anaphRatio;
-//			else numPos = (int) (1 / anaphRatio);
-			for(int i = 0; i < anaphNodes.size(); i++){
-				int label = anaphLabels.get(i);
-//				int numIters = (label == 1 ? numPos : numNeg);
-//				for(int j = 0; j < numIters; j++){
-					anaphOut.print(label);
-					for(svm_node node : anaphNodes.get(i)){
-						anaphOut.print(" ");
-						anaphOut.print(node.index);
-						anaphOut.print(":");
-						anaphOut.print(node.value);
-					}
-					anaphOut.println();
-//				}
-			}
-			anaphOut.flush();
-			anaphOut.close();
-			return;
-		}
+//
+//		if(anaphora){
+//			double anaphRatio = (double) posAnaphInst / (double) negAnaphInst;
+////			if(anaphRatio > 1.0) numNeg = (int) anaphRatio;
+////			else numPos = (int) (1 / anaphRatio);
+//			for(int i = 0; i < anaphNodes.size(); i++){
+//				int label = anaphLabels.get(i);
+////				int numIters = (label == 1 ? numPos : numNeg);
+////				for(int j = 0; j < numIters; j++){
+//					anaphOut.print(label);
+//					for(svm_node node : anaphNodes.get(i)){
+//						anaphOut.print(" ");
+//						anaphOut.print(node.index);
+//						anaphOut.print(":");
+//						anaphOut.print(node.value);
+//					}
+//					anaphOut.println();
+////				}
+//			}
+//			anaphOut.flush();
+//			anaphOut.close();
+//			return;
+//		}
 		if(printVectors){
 			neOut.close();
 			demOut.close();
@@ -433,4 +434,48 @@ public class ODIEVectorFileWriter extend
 		}
 		return array;
 	}
+	
+	/**
+	 * Runs this writer over every file in a directory of XMI files.
+	 * Prints a message and exits with status -1 when the arguments are unusable.
+	 * @param args training (XMI) directory, gold-pairs directory, output directory
+	 */
+	public static void main(String[] args){
+		if(args.length < 3){
+			System.err.println("Arguments: <training directory> <gold-pairs directory> <output directory>");
+			System.exit(-1);
+		}
+		File xmiDir = new File(args[0]);
+		if(!xmiDir.isDirectory()){
+			System.err.println("Arg1 should be a directory! (full of xmi files)");
+			System.exit(-1);
+		}
+		File[] files = xmiDir.listFiles();
+		if(files == null){
+			// listFiles() returns null on an I/O error (e.g. unreadable directory); avoid the NPE below
+			System.err.println("Could not list the files in " + xmiDir.getAbsolutePath());
+			System.exit(-1);
+		}
+		String[] paths = new String[files.length];
+		for(int i = 0; i < files.length; i++){
+			paths[i] = files[i].getAbsolutePath();
+		}
+		try {
+			CollectionReader xmiReader = CollectionReaderFactory.createCollectionReader(XMIReader.class,
+					XMIReader.PARAM_FILES,
+					paths);
+
+			AnalysisEngine consumer = AnalysisEngineFactory.createPrimitive(ODIEVectorFileWriter.class,
+					ODIEVectorFileWriter.PARAM_VECTORS, true,
+					ODIEVectorFileWriter.PARAM_TREES, false,
+					ODIEVectorFileWriter.PARAM_STOPS, "org/apache/ctakes/coreference/models/stop.txt",
+					ODIEVectorFileWriter.PARAM_FRAGS, "org/apache/ctakes/coreference/models/frags.txt",
+					ODIEVectorFileWriter.PARAM_GOLD_DIR, args[1],
+					ODIEVectorFileWriter.PARAM_OUTPUT_DIR, args[2]);
+			SimplePipeline.runPipeline(xmiReader, consumer);
+		}catch(Exception e){
+			System.err.println("Exception thrown!");
+			e.printStackTrace();
+		}
+	}
 }

Added: incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java?rev=1430073&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java (added)
+++ incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java Mon Jan  7 22:49:52 2013
@@ -0,0 +1,129 @@
+package org.apache.ctakes.coreference.cc;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.uima.UIMAException;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASRuntimeException;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.Options_ImplBase;
+import org.kohsuke.args4j.Option;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.factory.ConfigurationParameterFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class PreprocessAndWriteXmi {
+	/** Command-line options for {@link PreprocessAndWriteXmi#main(String[])}. */
+	public static class Options extends Options_ImplBase {
+
+		/** Directory containing the text files to process (required). */
+		@Option(name = "-t", 
+				aliases = "--textRoot", 
+				usage = "specify the directory containing the textFiles (for example /NLP/Corpus/Relations/mipacq/text/train)",
+				required = true)
+				public String textRoot;
+
+		// TODO - fix to use an xml collection reader instead of the hacky way it's done now...
+		//        planned option: -x / --xmlRoot (required File xmlRoot), the directory containing the
+		//        knowtator xml files (for example: /NLP/Corpus/Relations/mipacq/xml/train)
+
+		/** Directory into which the CAS XMI files are written (required). */
+		@Option(name = "-o",
+				aliases = "--outputRoot",
+				usage = "specify the directory to write out CAS XMI files",
+				required = true)
+				public File outputRoot;
+	}
+
+	/**
+	 * Runs the ODIESvmVectorCreator descriptor over the documents read from the text root
+	 * and hands each resulting CAS to {@link SerializeDocumentToXMI} for writing.
+	 *
+	 * @param args command-line arguments, parsed into {@link Options}
+	 * @throws IOException propagated from descriptor loading or pipeline execution
+	 * @throws UIMAException propagated from descriptor loading or pipeline execution
+	 */
+	public static void main(String[] args) throws UIMAException, IOException {
+		Options opts = new Options();
+		opts.parseOptions(args);
+
+		// Analysis engine loaded from its XML descriptor.
+		AnalysisEngine vectorCreator = AnalysisEngineFactory.createAnalysisEngineFromPath(
+				"desc/analysis_engine/ODIESvmVectorCreator.xml");
+
+		// Collection reader pointed at the text root given on the command line.
+		CollectionReader textReader = CollectionReaderFactory.createCollectionReaderFromPath(
+				"../ctakes-core/desc/collection_reader/FilesInDirectoryCollectionReader.xml",
+				FilesInDirectoryCollectionReader.PARAM_INPUTDIR,
+				opts.textRoot);
+
+		// Last pipeline step: the XMI writer, configured with the output root.
+		AnalysisEngine xmiWriter = AnalysisEngineFactory.createPrimitive(
+				PreprocessAndWriteXmi.SerializeDocumentToXMI.class,
+				PreprocessAndWriteXmi.SerializeDocumentToXMI.PARAM_OUTPUT_DIRECTORY,
+				opts.outputRoot.getPath());
+
+		SimplePipeline.runPipeline(textReader, vectorCreator, xmiWriter);
+	}
+
+	/** Annotator that writes each CAS it receives to an XMI file in a configured output directory. */
+	public static class SerializeDocumentToXMI extends JCasAnnotator_ImplBase {
+		public static final String PARAM_OUTPUT_DIRECTORY = ConfigurationParameterFactory
+		.createConfigurationParameterName(SerializeDocumentToXMI.class, "outputDirectory");
+		@ConfigurationParameter(mandatory = true, description = "Specifies the output directory in which to write xmi files")
+		private File outputDirectory;
+
+		/** Creates the output directory (and any missing parents) if it does not exist yet. */
+		@Override
+		public void initialize(UimaContext context) throws ResourceInitializationException {
+			super.initialize(context);
+			if (!this.outputDirectory.exists()) {
+				this.outputDirectory.mkdirs();
+			}
+		}
+
+		/** Serializes the given JCas itself (no separate gold view yet) to {@code <documentID>.xmi}; the file stream is always closed. */
+		@Override
+		public void process(JCas jCas) throws AnalysisEngineProcessException {
+			FileOutputStream out = null;
+			try {
+				String documentID = DocumentIDAnnotationUtil.getDocumentID(jCas);
+				if (documentID == null) {
+					throw new IllegalArgumentException("No documentID for CAS:\n" + jCas);
+				}
+				out = new FileOutputStream(new File(this.outputDirectory, documentID + ".xmi"));
+				ContentHandler handler = new XMLSerializer(out).getContentHandler();
+				new XmiCasSerializer(jCas.getTypeSystem()).serialize(jCas.getCas(), handler);
+				out.close(); // close on the success path so a failed flush/close is reported, not swallowed
+			} catch (CASRuntimeException e) {
+				throw new AnalysisEngineProcessException(e);
+			} catch (SAXException e) {
+				throw new AnalysisEngineProcessException(e);
+			} catch (IOException e) { // FileNotFoundException on open, or a failed close()
+				throw new AnalysisEngineProcessException(e);
+			} finally {
+				if (out != null) { try { out.close(); } catch (IOException ignored) { /* no-op if already closed; else the first failure wins */ } }
+			}
+		}
+	}
+
+}
+

Propchange: incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java
------------------------------------------------------------------------------
    svn:executable = *

Modified: incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/FeatureVector.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/FeatureVector.java?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/FeatureVector.java (original)
+++ incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/FeatureVector.java Mon Jan  7 22:49:52 2013
@@ -39,72 +39,66 @@ public class FeatureVector {
 	};
 
 	final static String[] ne_coref_feats = {
-			"SameSection",  // {yes, no}	 1
-			"TokenDistance"   , // numeric    
+			"TokenDistance"   , // numeric   1    
 			"SentenceDistance", // numeric
 			"ExactMatch"      , // {yes,no}
 			"StartMatch"      , // {yes,no}
-//			"MidMatch"        , // {yes,no}
 			"EndMatch"        , // {yes,no}
-//			"StringMatch"	  , // {yes,no}
 			"SoonStr"         , // {C,I}
 			"Pronoun1"        , // {Y,N}
 			"Pronoun2"        , // {Y,N}
-			"Definite2"       , // {Y,N}      10
-			"Demonstrative2"  , // {Y,N}      
-			//"NumberMatch"     , // {C,I,NA}
+			"Definite2"       , // {Y,N}      
+			"Demonstrative2"  , // {Y,N}      10
 			"NumberMatchC"    , // {Y,N}
 			"NumberMatchI"    , // {Y,N}
 			"NumberMatchNA"   , // {Y,N}
-			//"WnClass"         , // {C,I,NA}
 			"WnClassC"        , // {Y,N}
 			"WnClassI"        , // {Y,N}
 			"WnClassNA"       , // {Y,N}
 			"Alias"           , // {C,I}
 			"ProStr"          , // {C,I}
-			"SoonStrNonpro"   , // {C,I}      20
-			"WordOverlap"     , // {C,I}
+			"SoonStrNonpro"   , // {C,I}      
+			"WordOverlap"     , // {C,I}      20
 			"WordsSubstr"     , // {C,I}
-			//"BothDefinites"   , // {C,I,NA}
 			"BothDefinitesC"  , // {Y,N}
 			"BothDefinitesI"  , // {Y,N}
 			"BothDefinitesNA" , // {Y,N}
-			//"BothEmbedded"    , // {C,I,NA}
-			"BothEmbeddedC"   , // {Y,N}
-			"BothEmbeddedI"   , // {Y,N}
-			"BothEmbeddedNA"  , // {Y,N}
-			//"BothPronouns"    , // {C,I,NA}
 			"BothPronounsC"   , // {Y,N}
-			"BothPronounsI"   , // {Y,N}   30
+			"BothPronounsI"   , // {Y,N}   
 			"BothPronounsNA"  , // {Y,N}
 			"Indefinite"      , // {I,C}
 			"Pronoun"         , // {I,C}
-			"Definite1"       , // {Y,N}
+			"Definite1"       , // {Y,N}      30
 			"ClosestComp"     , // {C,I}
 			"IsDrug"           , // {Y,N}
 			"IsDisorder"       , // {Y,N}
 			"IsFinding"        , // {Y,N}
 			"IsProcedure"      , // {Y,N}
-			"IsAnatomicalSite" , // {Y,N}   40
+			"IsAnatomicalSite" , // {Y,N}   
 			"NPHead"          , // {yes, no}
 //			"Anaph"           , // numeric
 //			"PermStrDist"	  , //             
 			"PathLength"	  , // number of nodes in full path 37
 			"NPunderVP1"	  , // NP object?
-			"NPunderVP2"	  , //
+			"NPunderVP2"	  , //            40
 			"NPunderS1"		  , // NP subject?
 			"NPunderS2"       , //             
 			"NPunderPP1"	  , // PP object?  
 			"NPunderPP2"      , //             
 			"NPSubj1"		  , //			   
-			"NPSubj2"		  , //             50 
-			"NPSubjBoth"	  , //			   
-//			"NegatedBoth"	  , //
-//			"NonNegatedBoth"  ,
-			//"NPSubjBoth"    , //             
-//			"Cat:Ngrams"      , //			   :n-1
-//			"TK"			  ,
-//			"WordsStr"			// not used, why?		 
+			"NPSubj2"		  , //              
+			"NPSubjBoth"	  , //			
+			"WikiSim"		  ,
+//			"EntityWikiSim"   ,
+//			"SimSum"          , //            50
+			"AliasDrug"       ,
+			"AliasDisorder"   ,
+			"AliasFinding"    ,
+			"AliasProcedure"  ,
+			"AliasAnatomy"    ,
+			"EntityStartMatch",
+			"EntityExactMatch",
+			"EntityEndMatch",
 	};
 
 	final static String[] pron_coref_feats = ne_coref_feats;

Modified: incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/PairAttributeCalculator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/PairAttributeCalculator.java?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/PairAttributeCalculator.java (original)
+++ incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/PairAttributeCalculator.java Mon Jan  7 22:49:52 2013
@@ -53,17 +53,23 @@ import org.apache.ctakes.coreference.typ
 public class PairAttributeCalculator extends AttributeCalculator {
 
 	protected Markable m1, m2;
-	protected String s1, s2;
+    protected String ms1, ms2; // markable strings
+    protected String es1, es2; // entity strings
+//	protected String s1, s2;
 	protected Annotation a1, a2;
-
+	boolean alias;
+	
 	public PairAttributeCalculator (JCas jcas, Markable m1, Markable m2) {
 		super(jcas);
 		this.m1 = m1;
 		this.m2 = m2;
 		this.a1 = m1.getContent();
 		this.a2 = m2.getContent();
-		s1 = m1.getCoveredText();
-		s2 = m2.getCoveredText();
+		ms1 = m1.getCoveredText();
+		ms2 = m2.getCoveredText();
+		es1 = a1.getCoveredText();
+		es2 = a2.getCoveredText();
+		alias = isAlias();
 	}
 	
 	/**
@@ -107,39 +113,51 @@ public class PairAttributeCalculator ext
 		return AnnotationCounter.countPoint(AnnotationSelector.selectSentence(jcas), m1.getEnd(), m2.getBegin());
 	}
 
-	public String calcExactMatch () {
-		return s1.equalsIgnoreCase(s2) ? "yes" : "no";
+	public boolean calcExactMatch () {
+		return ms1.equalsIgnoreCase(ms2);
 	}
 
-	public String calcStartMatch () {
-		return TextMatch.startMatch(s1, s2) ? "yes" : "no";
+	public boolean calcStartMatch () {
+		return TextMatch.startMatch(ms1, ms2);
 	}
 
-	public String calcMidMatch () {
-		return "no";
+	public boolean calcMidMatch () {
+		return false;
 	}
 
-	public String calcEndMatch () {
-		return TextMatch.endMatch(s1, s2) ? "yes" : "no";
+	public boolean calcEndMatch () {
+		return TextMatch.endMatch(ms1, ms2);
 	}
 
-	public String calcStringMatch() {
-		return ( calcExactMatch().equals("yes") || calcStartMatch().equals("yes") || calcEndMatch().equals("yes") ? "yes" : "no");
+	public boolean calcStringMatch() {
+		return (calcExactMatch() || calcStartMatch() || calcEndMatch());
 	}
 	
-	public String calcSoonStr () {
-		String sl1 = s1.toLowerCase();
-		String sl2 = s2.toLowerCase();
+    public boolean calcEntityExactMatch() {
+        return es1.equalsIgnoreCase(es2);
+    }
+
+    public boolean calcEntityStartMatch() {
+    	return TextMatch.startMatch(es1, es2);
+    }
+
+    public boolean calcEntityEndMatch(){
+    	return TextMatch.endMatch(es1, es2);
+    }
+
+	public boolean calcSoonStr () {
+		String sl1 = ms1.toLowerCase();
+		String sl2 = ms2.toLowerCase();
 //		if (sl1.startsWith("the ")) sl1 = sl1.substring(4);
 //		if (sl1.startsWith("a ")) sl1 = sl1.substring(2);
 //		if (sl2.startsWith("the ")) sl2 = sl2.substring(4);
 //		if (sl2.startsWith("a ")) sl2 = sl2.substring(2);
 		sl1 = nonDetSubstr(sl1);
 		sl2 = nonDetSubstr(sl2);
-		return sl1.equals(sl2) ? "C" : "I";
+		return sl1.equals(sl2);
 	}
 	
-	private String nonDetSubstr (String s) {
+	private static String nonDetSubstr (String s) {
 		if(s.startsWith("the ")) return s.substring(4);
 		if(s.startsWith("a ")) return s.substring(2);
 		if(s.startsWith("this ")) return s.substring(5);
@@ -147,58 +165,59 @@ public class PairAttributeCalculator ext
 		return s;
 	}
 
-	public String calcPronoun1 () {
-		return isPronoun(m1) ? "Y" : "N";
+	public boolean calcPronoun1 () {
+		return isPronoun(m1);
 	}
 
-	public String calcPronoun2 () {
-		return isPronoun(m2) ? "Y" : "N";
+	public boolean calcPronoun2 () {
+		return isPronoun(m2);
 	}
 
-	public String calcDefinite2 () {
-		return isDefinite(s2) ? "Y" : "N";
+	public boolean calcDefinite2 () {
+		return isDefinite(ms2);
 	}
 
-	public String calcDemonstrative2 () {
-		return isDemonstrative(s2) ? "Y" : "N";
+	public boolean calcDemonstrative2 () {
+		return isDemonstrative(ms2);
 	}
 
-	public String calcNumberMatchC () {
+	public boolean calcNumberMatchC () {
 		String n1 = number(m1);
 		String n2 = number(m2);
-		if (!n1.equals("U") && !n2.equals("U") && n1.equals(n2))
-			return "Y";
-		else
-			return "N";
+		if (!n1.equals("U") && !n2.equals("U") && n1.equals(n2)){
+			return true;
+		}
+		return false;
 	}
 
-	public String calcNumberMatchI () {
+	public boolean calcNumberMatchI () {
 		String n1 = number(m1);
 		String n2 = number(m2);
-		if (!n1.equals("U") && !n2.equals("U") && !n1.equals(n2))
-			return "Y";
-		else
-			return "N";
+		if (!n1.equals("U") && !n2.equals("U") && !n1.equals(n2)){
+			return true;
+		}
+		return false;
 	}
 
-	public String calcNumberMatchNA () {
+	public boolean calcNumberMatchNA () {
 		String n1 = number(m1);
 		String n2 = number(m2);
-		if (n1.equals("U") || n2.equals("U"))
-			return "Y";
-		else return "N";
+		if (n1.equals("U") || n2.equals("U")){
+			return true;
+		}
+		return false;
 	}
 
-	public String calcNumberMatch () {
-		String n1 = number(m1);
-		String n2 = number(m2);
-		if (n1.equals("U") || n2.equals("U"))
-			return "NA";
-		else if (n1.equals(n2))
-			return "C";
-		else
-			return "I";
-	}
+//	public String calcNumberMatch () {
+//		String n1 = number(m1);
+//		String n2 = number(m2);
+//		if (n1.equals("U") || n2.equals("U"))
+//			return "NA";
+//		else if (n1.equals(n2))
+//			return "C";
+//		else
+//			return "I";
+//	}
 
 	// heuristics
 	//	public String calcAppositive () {
@@ -208,51 +227,57 @@ public class PairAttributeCalculator ext
 	//		else return "no";
 	//	}
 
-	public String calcWnClassC () {
+	public boolean calcWnClassC () {
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
 				m2.getContent() instanceof IdentifiedAnnotation) {
-				IdentifiedAnnotation ne1 = (IdentifiedAnnotation) m1.getContent();
-				IdentifiedAnnotation ne2 = (IdentifiedAnnotation) m2.getContent();
-				if (ne1.getTypeID() == ne2.getTypeID())
-					return "C";
-				else return "N";
-			} else
-				return "N";
+			IdentifiedAnnotation ne1 = (IdentifiedAnnotation) m1.getContent();
+			IdentifiedAnnotation ne2 = (IdentifiedAnnotation) m2.getContent();
+			if (ne1.getTypeID() == ne2.getTypeID()){
+				return true;
+			}
+			return false;
+		}
+		return false;
 	}
 
-	public String calcWnClassI () {
+	public boolean calcWnClassI () {
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
 				m2.getContent() instanceof IdentifiedAnnotation) {
-				IdentifiedAnnotation ne1 = (IdentifiedAnnotation) m1.getContent();
-				IdentifiedAnnotation ne2 = (IdentifiedAnnotation) m2.getContent();
-				if (ne1.getTypeID() != ne2.getTypeID())
-					return "Y";
-				else return "N";
-			} else
-				return "N";
+			IdentifiedAnnotation ne1 = (IdentifiedAnnotation) m1.getContent();
+			IdentifiedAnnotation ne2 = (IdentifiedAnnotation) m2.getContent();
+			if (ne1.getTypeID() != ne2.getTypeID()){
+				return true;
+			}
+			return false;
+		}
+		return false;
 	}
 
-	public String calcWnClassNA () {
+	public boolean calcWnClassNA () {
 		if (!(m1.getContent() instanceof IdentifiedAnnotation) ||
-				!(m2.getContent() instanceof IdentifiedAnnotation))
-			return "Y";
-		else
-			return "N";
+				!(m2.getContent() instanceof IdentifiedAnnotation)){
+			return true;
+		}
+		return false;
 	}
 
-	public String calcWnClass () {
+	public boolean calcWnClass () {
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
-			m2.getContent() instanceof IdentifiedAnnotation) {
+				m2.getContent() instanceof IdentifiedAnnotation) {
 			IdentifiedAnnotation ne1 = (IdentifiedAnnotation) m1.getContent();
 			IdentifiedAnnotation ne2 = (IdentifiedAnnotation) m2.getContent();
-			if (ne1.getTypeID() == ne2.getTypeID())
-				return "C";
-			else return "I";
-		} else
-			return "NA";
+			if (ne1.getTypeID() == ne2.getTypeID()){
+				return true;
+			}
+		}
+		return false;
 	}
 
-	public String calcAlias () {
+	public boolean calcAlias () {
+		return alias;
+	}
+	
+	public boolean isAlias(){
 		try{
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
 			m2.getContent() instanceof IdentifiedAnnotation) {
@@ -269,22 +294,22 @@ public class PairAttributeCalculator ext
 			for (int i = 0; i < fsa.size(); ++i)
 				if (fsa.get(i) instanceof UmlsConcept &&
 					l.contains(((UmlsConcept)fsa.get(i)).getCui()))
-					return "C";
+					return true;
 		}
 		}catch(Exception e){
 			System.err.println("Error here!");
 		}
-		return "I";
+		return false;
 	}
 	
 	// PRO_STR in Ng and Cardie
-	public String calcProStr () {
+	public boolean calcProStr () {
 		if (isPronominal(m1) &&
 			isPronominal(m2) &&
-			s1.equalsIgnoreCase(s2))
-			return "C";
-		else
-			return "I";
+			ms1.equalsIgnoreCase(ms2)){
+			return true;
+		}
+		return false;
 	}
 
 //	public String calcPnStr () {
@@ -298,15 +323,15 @@ public class PairAttributeCalculator ext
 //	}
 
 	// WORDS_STR in Ng and Cardie - currently not used
-	public String calcWordsStr () {
+	public boolean calcWordsStr () {
 		if (!isPronominal(m1) && !isPronominal(m2) &&
-			s1.equalsIgnoreCase(s2))
-			return "C";
-		else
-			return "I";
+			ms1.equalsIgnoreCase(ms2)){
+			return true;
+		}
+		return false;
 	}
 
-	private String removeArticleAndDemon (String s) {
+	private static String removeArticleAndDemon(String s){
 		if (s.toLowerCase().startsWith("a "))
 			return s.substring(2);
 		else if (s.toLowerCase().startsWith("an "))
@@ -326,32 +351,35 @@ public class PairAttributeCalculator ext
 	}
 
 	// SOON_STR_NONPRO from Ng and Cardie
-	public String calcSoonStrNonpro () {
+	public boolean calcSoonStrNonpro () {
 		if (!isPronominal(m1) && !isPronominal(m2)) {
-			String str1 = removeArticleAndDemon(s1);
-			String str2 = removeArticleAndDemon(s2);
+			String str1 = removeArticleAndDemon(ms1);
+			String str2 = removeArticleAndDemon(ms2);
 			if (str1.toLowerCase().indexOf(str2.toLowerCase()) >= 0 ||
-				str2.toLowerCase().indexOf(str1.toLowerCase()) >= 0)
-				return "C";
+				str2.toLowerCase().indexOf(str1.toLowerCase()) >= 0){
+				return true;
+			}
 		}
-		return "I";
+		return false;
 	}
 
 
 	// WORD_OVERLAP from Ng and Cardie 02
-	public String calcWordOverlap () {
+	public boolean calcWordOverlap () {
 		ArrayList<String> t1 = contentWords(m1);
 		ArrayList<String> t2 = contentWords(m2);
-		for (String s : t2)
-			if (t1.contains(s))
-				return "C";
-		return "I";
+		for (String s : t2){
+			if (t1.contains(s)){
+				return true;
+			}
+		}
+		return false;
 	}
 
 	// TODO with syntax
 	// MODIFIER from Ng and Cardie 02
-	public String calcModifier () {
-		return "yes"; 
+	public boolean calcModifier () {
+		return true; 
 	}
 
 //	public String calcPnSubstr () {
@@ -360,7 +388,7 @@ public class PairAttributeCalculator ext
 
 	// is l1 a proper substring of l2?
 	// TODO optimize with Stringbuffer instead of concatenation
-	private boolean isProperSubstring (ArrayList<String> l1, ArrayList<String> l2) {
+	private static boolean isProperSubstring (ArrayList<String> l1, ArrayList<String> l2) {
 		String str1 = "";
 		String str2 = "";
 		for (String s : l1)
@@ -368,84 +396,84 @@ public class PairAttributeCalculator ext
 		for (String s: l2)
 			str2 += " " + s;
 		// FIXME This should be an AND ?
-		if (str1.length()!=str2.length() || str2.indexOf(str1)>=0)
+		if (str1.length()!=str2.length() || str2.indexOf(str1)>=0){
 			return true;
-		else
-			return false;
+		}
+		return false;
 	}
 
-	public String calcWordsSubstr () {
+	public boolean calcWordsSubstr () {
 		if (!isPronominal(m1) && !isPronominal(m2)) {
 			ArrayList<String> t1 = contentWords(m1);
 			ArrayList<String> t2 = contentWords(m2);
-			if (isProperSubstring(t1, t2) || isProperSubstring(t2, t1))
-				return "C";
+			if (isProperSubstring(t1, t2) || isProperSubstring(t2, t1)){
+				return true;
+			}
 		}
-		return "I";
-	}
-
-	public String calcBothDefinitesC () {
-		return (isDefinite(s1) && isDefinite(s2)) ? "Y" : "N";
-	}
-
-	public String calcBothDefinitesI () {
-		return (!isDefinite(s1) && !isDefinite(s2)) ? "Y" : "N";
+		return false;
 	}
 
-	public String calcBothDefinitesNA () {
-		boolean b1 = isDefinite(s1);
-		boolean b2 = isDefinite(s2);
-		return (!(b1&&b2) && (b1||b2)) ? "Y" : "N";
+	public boolean calcBothDefinitesC () {
+		return (isDefinite(ms1) && isDefinite(ms2));
 	}
 
-	public String calcBothDefinites () {
-		boolean b1 = isDefinite(s1);
-		boolean b2 = isDefinite(s2);
-		if (b1 && b2) return "C";
-		if (b1 || b2) return "NA";
-		return "I";
+	public boolean calcBothDefinitesI () {
+		return (!isDefinite(ms1) && !isDefinite(ms2));
 	}
 
-	public String calcBothEmbeddedC () {
-		return "N"; //TODO: sketch
+	public boolean calcBothDefinitesNA () {
+		boolean b1 = isDefinite(ms1);
+		boolean b2 = isDefinite(ms2);
+		return (!(b1&&b2) && (b1||b2));
 	}
 
-	public String calcBothEmbeddedI () {
-		return "N"; //TODO: sketch
-	}
-
-	public String calcBothEmbeddedNA () {
-		return "N"; //TODO: sketch
-	}
+//	public String calcBothDefinites () {
+//		boolean b1 = isDefinite(ms1);
+//		boolean b2 = isDefinite(ms2);
+//		if (b1 && b2) return "C";
+//		if (b1 || b2) return "NA";
+//		return "I";
+//	}
 
-	public String calcBothEmbedded () {
-		return "NA"; //TODO: sketch
-	}
+//	public String calcBothEmbeddedC () {
+//		return "N"; //TODO: sketch
+//	}
+//
+//	public String calcBothEmbeddedI () {
+//		return "N"; //TODO: sketch
+//	}
+//
+//	public String calcBothEmbeddedNA () {
+//		return "N"; //TODO: sketch
+//	}
+//
+//	public String calcBothEmbedded () {
+//		return "NA"; //TODO: sketch
+//	}
 
-	public String calcBothPronounsC () {
+	public boolean calcBothPronounsC () {
 		boolean b1 = isPronoun(m1);
 		boolean b2 = isPronoun(m2);
-		return (b1 && b2) ? "Y" : "N";
+		return (b1 && b2);
 	}
 
-	public String calcBothPronounsI () {
+	public boolean calcBothPronounsI () {
 		boolean b1 = isPronoun(m1);
 		boolean b2 = isPronoun(m2);
-		return (!b1 && !b2) ? "Y" : "N";
+		return (!b1 && !b2);
 	}
 
-	public String calcBothPronounsNA () {
+	public boolean calcBothPronounsNA () {
 		boolean b1 = isPronoun(m1);
 		boolean b2 = isPronoun(m2);
-		return (!(b1&&b2) && (b1||b2)) ? "Y" : "N";
+		return (!(b1&&b2) && (b1||b2));
 	}
 
-	public String calcBothPronouns () {
+	public boolean calcBothPronouns () {
 		boolean b1 = isPronoun(m1);
 		boolean b2 = isPronoun(m2);
-		if (b1 && b2) return "C";
-		if (b1 || b2) return "NA";
-		return "I";
+		if (b1 && b2) return true;
+		return false;
 	}
 
 //	public String calcSpan () {
@@ -458,121 +486,115 @@ public class PairAttributeCalculator ext
 //		}
 //	}
 
-	public String calcIndefinite () {
-		if (s2.toLowerCase().startsWith("a ") ||
-			s2.toLowerCase().startsWith("an "))
-			return "I";
-		else
-			return "C";
+	public boolean calcIndefinite () {
+		if (ms2.toLowerCase().startsWith("a ") ||
+			ms2.toLowerCase().startsWith("an ")){
+			return false;
+		}
+		return true;
 	}
 
-	public String calcPronoun () {
-		 return (isPronoun(m1) && !isPronoun(m2)) ? "I" : "C";
+	public boolean calcPronoun () {
+		 return !(isPronoun(m1) && !isPronoun(m2));
 	}
 
 //	public String calcContainsPn () {
 //		
 //	}
 
-	public String calcDefinite1 () {
-		return isDefinite(s1)?"Y":"N";
+	public boolean calcDefinite1 () {
+		return isDefinite(ms1);
 	}
 
 //	public String calcProperNoun () {
 //		
 //	}
 
-	public String calcIsDrug () {
+	public boolean calcIsDrug () {
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
-				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_DRUG)
-			return "Y";
-		else
-			return "N";
+				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_DRUG){
+			return true;
+		}
+		return false;
 	}
 
-	public String calcIsDisorder () {
+	public boolean calcIsDisorder () {
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
-				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_DISORDER)
-			return "Y";
-		else
-			return "N";
+				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_DISORDER){
+			return true;
+		}
+		return false;
 	}
 
-	public String calcIsFinding () {
+	public boolean calcIsFinding () {
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
-				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_FINDING)
-			return "Y";
-		else
-			return "N";
+				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_FINDING){
+			return true;
+		}
+		return false;
 	}
 
-	public String calcIsProcedure () {
+	public boolean calcIsProcedure () {
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
-				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_PROCEDURE)
-			return "Y";
-		else
-			return "N";
+				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_PROCEDURE){
+			return true;
+		}
+		return false;
 	}
 
-	public String calcIsAnatomicalSite () {
+	public boolean calcIsAnatomicalSite () {
 		if (m1.getContent() instanceof IdentifiedAnnotation &&
-				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_ANATOMICAL_SITE)
-			return "Y";
-		else
-			return "N";
+				((IdentifiedAnnotation)m1.getContent()).getTypeID() == CONST.NE_TYPE_ID_ANATOMICAL_SITE){
+			return true;
+		}
+		return false;
 	}
 
-	public double calcNegatedBoth(){
+	public boolean calcNegatedBoth(){
 		if(a1 instanceof EntityMention && a2 instanceof EntityMention){
 			if(((EntityMention)a1).getPolarity() == -1 &&
 			   ((EntityMention)a2).getPolarity() == -1){
-				return 1.0;
-			}else{
-				return 0.0;
+				return true;
 			}
-		}else{
-			return 0.0;
 		}
+		return false;
 	}
 	
-	public double calcNonNegatedBoth(){
+	public boolean calcNonNegatedBoth(){
 		if(a1 instanceof EntityMention && a2 instanceof EntityMention){
 			if(((EntityMention)a1).getPolarity() == 1.0 &&
 			   ((EntityMention)a2).getPolarity() == 1.0){
-				return 1.0;
-			}else{
-				return 0.0;
+				return true;
 			}
-		}else{
-			return 0.0;
 		}
+		return false;
 	}
 	
-	public String calcClosestComp () {
-		if (calcWnClass().equals("C")) {
+	public boolean calcClosestComp () {
+		if (calcWnClass()) {
 			ArrayList<Annotation> l = AnnotationSelector.selectNE(jcas);
 			int m2type = ((IdentifiedAnnotation)m2.getContent()).getTypeID();
 			for (Annotation a : l) {
 				if (((IdentifiedAnnotation)a).getTypeID()==m2type &&
 					a.getBegin()>=m1.getEnd() &&
 					a.getEnd()<=m2.getBegin())
-					return "I";
+					return false;
 			}
-			return "C";
+			return true;
 		}
-		return "I";
+		return false;
 	}
 
-	public String calcNPHead () {
+	public boolean calcNPHead () {
 		Annotation a = m1.getContent();
 //		return (a.getEnd()==m1.getEnd() && a.getBegin()>m1.getBegin()) ? "yes" : "no";
 		FSIterator iter = jcas.getJFSIndexRepository().getAnnotationIndex(LookupWindowAnnotation.type).iterator();
 		while (iter.hasNext()) {
 			LookupWindowAnnotation lwa = (LookupWindowAnnotation) iter.next();
 			if (lwa.getBegin()<=a.getBegin() && lwa.getEnd()==a.getEnd())
-				return "yes";
+				return true;
 		}
-		return "no";
+		return false;
 	}
 
 	
@@ -584,4 +606,24 @@ public class PairAttributeCalculator ext
 		return 0.0;
 	}
 
+	public boolean calcAliasDrug (){
+		return (alias && calcIsDrug());
+	}
+
+	public boolean calcAliasDisorder(){
+		return (alias && calcIsDisorder());
+	}
+
+	public boolean calcAliasFinding(){
+		return (alias && calcIsFinding());
+	}
+
+	public boolean calcAliasProcedure(){
+		return (alias && calcIsProcedure());
+	}
+
+	public boolean calcAliasAnatomy(){
+		return (alias && calcIsAnatomicalSite());
+	}
+
 }

Modified: incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SvmVectorCreator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SvmVectorCreator.java?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SvmVectorCreator.java (original)
+++ incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SvmVectorCreator.java Mon Jan  7 22:49:52 2013
@@ -18,39 +18,44 @@
  */
 package org.apache.ctakes.coreference.util;
 
+import java.io.File;
+import java.io.IOException;
 import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.Collection;
-import java.util.LinkedList;
 import java.util.HashSet;
+import java.util.LinkedList;
 
 import libsvm.svm;
 import libsvm.svm_model;
 import libsvm.svm_node;
 
-import opennlp.tools.parser.Parse;
-
 import org.apache.ctakes.constituency.parser.treekernel.TreeExtractor;
 import org.apache.ctakes.constituency.parser.util.TreeUtils;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-
+import org.apache.ctakes.core.resource.FileLocator;
 import org.apache.ctakes.coreference.type.Markable;
 import org.apache.ctakes.utils.tree.FragmentUtils;
 import org.apache.ctakes.utils.tree.SimpleTree;
+import org.apache.ctakes.utils.wiki.WikiIndex;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.pear.util.FileUtil;
 
 public class SvmVectorCreator {
 	HashSet<String> stopwords = null;
 	private svm_model anaph_model = null;
 	ArrayList<SimpleTree> frags = new ArrayList<SimpleTree>();
+	WikiIndex wiki = null;
+	static final int NUM_WIKI_HITS = 5;
 	
 	public SvmVectorCreator(HashSet<String> stopwords){
 		this.stopwords = stopwords;
-	}
-
-	public SvmVectorCreator(HashSet<String> stopwords, svm_model anaph){
-		this.stopwords = stopwords;
-		anaph_model = anaph;
+		try{
+			wiki = new WikiIndex(NUM_WIKI_HITS, FileLocator.locateFile("org/apache/ctakes/coreference/models/index_med_5k").getAbsolutePath(), "text");
+			wiki.initialize();
+		}catch(IOException e){
+			e.printStackTrace();
+			wiki = null;
+		}
 	}
 
 	public svm_node[] createAnaphoricityVector(Markable m, JCas aJCas) {
@@ -118,7 +123,7 @@ public class SvmVectorCreator {
 	public svm_node[] getNodeFeatures(Markable anaphor, Markable antecedent, JCas aJCas, boolean needsAnaph) {
 		LinkedList<svm_node> nodes = new LinkedList<svm_node>();
 		String[] feats = FeatureVector.getNECorefFeatures();
-		SyntaxAttributeCalculator sac = new SyntaxAttributeCalculator(aJCas, antecedent, anaphor);
+		SyntaxAttributeCalculator sac = new SyntaxAttributeCalculator(aJCas, antecedent, anaphor, wiki);
 		sac.setStopWordsList(stopwords);
 		int ind = 0;
 		for (int i = 0; i < feats.length; i++, ind++) {
@@ -183,7 +188,15 @@ public class SvmVectorCreator {
 							n.value = (Double) val;
 							nodes.add(n);
 						}
+					}else if (val instanceof Boolean) {
+						if((Boolean) val == true){
+							svm_node n = new svm_node();
+							n.index = ind + 1;
+							n.value = 1.0;
+							nodes.add(n);
+						}
 					}
+
 				}
 			} catch (Exception e) { e.printStackTrace(); }
 		}

Modified: incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SyntaxAttributeCalculator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SyntaxAttributeCalculator.java?rev=1430073&r1=1430072&r2=1430073&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SyntaxAttributeCalculator.java (original)
+++ incubator/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/util/SyntaxAttributeCalculator.java Mon Jan  7 22:49:52 2013
@@ -33,6 +33,7 @@ import org.apache.ctakes.core.resource.F
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
 import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.utils.wiki.WikiIndex;
 import org.apache.ctakes.coreference.type.Markable;
 import org.apache.ctakes.coreference.type.DemMarkable;
 import org.apache.ctakes.coreference.type.NEMarkable;
@@ -47,6 +48,9 @@ public class SyntaxAttributeCalculator e
 	ConllDependencyNode depLca=null;
 	String path = null;
 	String depPath = null;
+	WikiIndex wiki = null;
+	double sim1=-1.0;
+	double sim2=-1.0;
 	private static int numNEFeats = 0;
 	private static int numDemFeats = 0;
 	private static int numPronFeats = 0;
@@ -77,16 +81,14 @@ public class SyntaxAttributeCalculator e
 	static int[] selFeats = {0};
 	static int[] pronSelFeats = {0};
 	
-	static{
-		// TODO initialize feature types...
-		// read in feature files for each classifier type
-		// TODO don't hard code these file names or at least not the assumption they are not in a subdir of what's on the path
-		featSet = loadFeatures(selFeats, "ngramids.mayo.txt");
-		pronFeatSet = loadFeatures(pronSelFeats, "pronngramids.mayo.txt");
-		numNEFeats = selFeats.length;
-		numDemFeats = 0;
-		numPronFeats = pronSelFeats.length;
-	}
+//	static{
+//		// read in feature files for each classifier type
+//		featSet = loadFeatures(selFeats, "ngramids.mayo.txt");
+//		pronFeatSet = loadFeatures(pronSelFeats, "pronngramids.mayo.txt");
+//		numNEFeats = selFeats.length;
+//		numDemFeats = 0;
+//		numPronFeats = pronSelFeats.length;
+//	}
 
 	static ArrayList<String> loadFeatures(int[] featInds, String filename){
 		ArrayList<String> feats = new ArrayList<String>();
@@ -113,7 +115,11 @@ public class SyntaxAttributeCalculator e
 		return feats;
 	}
 
-	public SyntaxAttributeCalculator(JCas jcas, Markable m1, Markable m2) {
+	public SyntaxAttributeCalculator(JCas jcas, Markable m1, Markable m2){
+		this(jcas,m1,m2,null);
+	}
+	
+	public SyntaxAttributeCalculator(JCas jcas, Markable m1, Markable m2, WikiIndex wiki) {
 		super(jcas,m1,m2);
 		n1 = MarkableTreeUtils.markableNode(jcas, m1.getBegin(), m1.getEnd());
 		n2 = MarkableTreeUtils.markableNode(jcas, m2.getBegin(), m2.getEnd());
@@ -121,13 +127,13 @@ public class SyntaxAttributeCalculator e
 		while(true){
 			if(n1 == null || lca == null || lca.getBegin() <= n1.getBegin()){
 				break;
-			}else{
-				lca = lca.getParent();
 			}
+			lca = lca.getParent();
 		}
 		ngrams = new HashMap<String,Integer>();
 		calcFullPath();
-
+		this.wiki = wiki;
+		if(this.wiki != null) initWikiSim();
 //		c1 = MarkableDepUtils.markableNode(jcas, m1.getBegin(), m1.getEnd(), n1);
 //		c2 = MarkableDepUtils.markableNode(jcas, m2.getBegin(), m2.getEnd(), n2);
 //		depLca = getDepLCA(c1,c2);
@@ -138,12 +144,11 @@ public class SyntaxAttributeCalculator e
 	public static int getNumDemFeats(){ return numDemFeats; }
 	public static int getNumPronFeats(){ return numPronFeats; }
 	
-	private String calcNPunderPP(TreebankNode n){
+	private static String calcNPunderPP(TreebankNode n){
 		if(n != null && n.getParent() != null && n.getParent().getNodeType().equals("PP")){
 			return "Y";
-		}else{
-			return "N";
 		}
+		return "N";
 	}
 
 	public String calcNPunderPP1(){
@@ -154,12 +159,11 @@ public class SyntaxAttributeCalculator e
 		return calcNPunderPP(n2);
 	}
 
-	private String calcNPunderS(TreebankNode n){
+	private static String calcNPunderS(TreebankNode n){
 		if(n != null && n.getParent() != null && n.getParent().getNodeType().equals("S")){
 			return "Y";
-		}else{
-			return "N";
 		}
+		return "N";
 	}
 
 	public String calcNPunderS1(){
@@ -170,12 +174,11 @@ public class SyntaxAttributeCalculator e
 		return calcNPunderS(n2);
 	}
 
-	private String calcNPunderVP(TreebankNode n){
+	private static String calcNPunderVP(TreebankNode n){
 		if(n != null && n.getParent() != null && n.getParent().getNodeType().equals("VP")){
 			return "Y";
-		}else{
-			return "N";
 		}
+		return "N";
 	}
 
 	public String calcNPunderVP1(){
@@ -186,31 +189,71 @@ public class SyntaxAttributeCalculator e
 		return calcNPunderVP(n2);
 	}
 
-	public String calcNPSubj(TreebankNode n){
-		if(n == null) return "N";
+	public boolean calcNPSubj(TreebankNode n){
+		if(n == null) return false;
 		if(n.getNodeType().equals("NP")){
 			StringArray tags = n.getNodeTags();
-			if(tags.size() > 0){
+			if(tags != null && tags.size() > 0){
 				for(int i = 0; i < tags.size(); i++){
 					if(tags.get(i).equals("SBJ")){
-						return "Y";
+						return true;
 					}
 				}
 			}
 		}
-		return "N";
+		return false;
 	}
 
-	public String calcNPSubj1(){
+	public boolean calcNPSubj1(){
 		return calcNPSubj(n1);
 	}
 	
-	public String calcNPSubj2(){
+	public boolean calcNPSubj2(){
 		return calcNPSubj(n2);
 	}
 	
-	public String calcNPSubjBoth(){
-		return ((calcNPSubj1().equals("Y") && calcNPSubj2().equals("Y")) ? "Y" : "N");
+	public boolean calcNPSubjBoth(){
+		return (calcNPSubj1() && calcNPSubj2());
+	}
+
+	public void initWikiSim(){
+		if(wiki == null) sim1 = 0.0;
+		else{
+			try{
+				sim1 = wiki.getCosineSimilarity(ms1, ms2);
+				sim2 = wiki.getCosineSimilarity(es1, es2);
+			}catch(Exception e){
+				sim1 = 0.0;
+				sim2 = 0.0;
+			}
+		}
+	}
+
+	public void initEntityWikiSim(){
+		if(wiki == null) sim2 = 0.0;
+		else{
+			try{
+				sim2 = wiki.getCosineSimilarity(es1, es2);
+			}catch(Exception e){
+				sim2 = 0.0;
+			}
+		}		
+	}
+	
+	public double calcWikiSim(){
+		if(sim1 < 0.0) initWikiSim();
+		return sim1;
+	}
+	
+	public double calcEntityWikiSim(){
+		if(sim2 < 0.0) initEntityWikiSim();
+		return sim2;
+	}
+	
+	public double calcSimSum(){
+		if(sim1 < 0.0) initWikiSim();
+		if(sim2 < 0.0) initEntityWikiSim();
+		return (sim1+sim2)/2.0;
 	}
 
 	public int numNgrams(Markable m) throws UnexpectedException{
@@ -288,12 +331,11 @@ public class SyntaxAttributeCalculator e
 	}
 	
 	public int getPathLength(){
-		String path = calcFullPath();
 		String[] nodes = path.split("[<>]");
-		return nodes.length;		
+		return nodes.length;
 	}
 
-	private ConllDependencyNode getDepLCA(ConllDependencyNode c1, ConllDependencyNode c2) {
+	private static ConllDependencyNode getDepLCA(ConllDependencyNode c1, ConllDependencyNode c2) {
 		HashSet<Annotation> ancestors = new HashSet<Annotation>();
 		ConllDependencyNode temp = null;
 		temp = c2.getHead();
@@ -361,7 +403,7 @@ public class SyntaxAttributeCalculator e
 		return depPath;
 	}
 
-	private void initNGrams(HashMap<String,Integer> ngrams, String path, int n) {
+	private static void initNGrams(HashMap<String,Integer> ngrams, String path, int n) {
 		// Find the collection of trigrams in this string and add them to the hash map.
 		// start by finding the endpoint of the first trigram, then iteratively move the endpoint forward one unit
 		// while moving a beginning point forward one gram as well.

Added: incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fdt
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fdt?rev=1430073&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fdt
------------------------------------------------------------------------------
    svn:executable = *

Propchange: incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fdt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fdx
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-coreference/src/main/resources/org/apache/ctakes/coreference/models/index_med_5k/_3.fdx?rev=1430073&view=auto
==============================================================================
Binary file - no diff available.