You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/07/16 16:17:41 UTC
svn commit: r1611017 - in
/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features:
DistSemFeatureExtractor.java TokenFeatureExtractor.java
UMLSFeatureExtractor.java
Author: tmill
Date: Wed Jul 16 14:17:40 2014
New Revision: 1611017
URL: http://svn.apache.org/r1611017
Log:
CTAKES-199: Features for relation-based coreference.
Added:
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java
Modified:
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java?rev=1611017&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistSemFeatureExtractor.java Wed Jul 16 14:17:40 2014
@@ -0,0 +1,105 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.classifier.Feature;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Relation feature extractor that compares two mentions through word embeddings.
+ *
+ * <p>It emits a single feature, {@code HEAD_SIMILARITY_WORD2VEC}, whose value is whatever
+ * {@code WordEmbeddings.getSimilarity} reports for the lowercased nominal head words of the two
+ * arguments, or {@link #DEFAULT_SIM} when either head is missing or not in the vocabulary.
+ */
+public class DistSemFeatureExtractor implements RelationFeaturesExtractor {
+
+  // default value is 0.5 (rather than 0.0) because we don't want to assume OOV words are dissimilar
+  public static final double DEFAULT_SIM = 0.5;
+
+  /** Location handed to {@code FileLocator} to find the embeddings file. */
+  private static final String VECTORS_LOCATION = "org/apache/ctakes/coreference/distsem/mimic_vectors.txt";
+
+  /** Embeddings loaded once in the constructor and only read afterwards. */
+  private final WordEmbeddings words;
+
+  /**
+   * Loads the embeddings from {@link #VECTORS_LOCATION}.
+   *
+   * @throws FileNotFoundException if the embeddings resource cannot be located
+   * @throws IOException if reading the embeddings fails
+   */
+  public DistSemFeatureExtractor() throws FileNotFoundException, IOException {
+    // NOTE(review): the stream is not closed here; confirm that getEmbeddings closes it.
+    words = WordVectorReader.getEmbeddings(FileLocator.getAsStream(VECTORS_LOCATION));
+  }
+
+  /**
+   * Builds the head-word similarity feature for a pair of mentions.
+   *
+   * @param jCas the CAS holding the dependency parse used to find the heads
+   * @param arg1 first mention
+   * @param arg2 second mention
+   * @return a list holding exactly one feature, {@code HEAD_SIMILARITY_WORD2VEC}
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1,
+      IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+
+    // Disabled: similarity of the normalised token-vector sums of the two arguments.
+//  double[] a1vec = getArgVector(arg1);
+//  double[] a2vec = getArgVector(arg2);
+//
+//  if(a1vec != null && a2vec != null){
+//    for(int i = 0; i < a1vec.length; i++){
+//      sim += a1vec[i] * a2vec[i];
+//    }
+//  }else{
+//    sim = DEFAULT_SIM;
+//  }
+//
+//  assert !Double.isNaN(sim);
+//
+//  feats.add(new Feature("ARG_SIMILARITY_WORD2VEC", sim));
+
+    String head1 = headWord(jCas, arg1);
+    String head2 = headWord(jCas, arg2);
+
+    // Fall back to the neutral default unless both heads have an embedding.
+    double sim = DEFAULT_SIM;
+    if (head1 != null && head2 != null && words.containsKey(head1) && words.containsKey(head2)) {
+      sim = words.getSimilarity(head1, head2);
+    }
+    feats.add(new Feature("HEAD_SIMILARITY_WORD2VEC", sim));
+
+    return feats;
+  }
+
+  /**
+   * Returns the lowercased text of the nominal head of {@code arg}, or null when the dependency
+   * utility finds no head node.
+   */
+  private static String headWord(JCas jCas, IdentifiedAnnotation arg) {
+    ConllDependencyNode node = DependencyUtility.getNominalHeadNode(jCas, arg);
+    // Locale.ROOT keeps the vocabulary lookup independent of the JVM's default locale.
+    return node == null ? null : node.getCoveredText().toLowerCase(Locale.ROOT);
+  }
+
+  /**
+   * Sums the embeddings of the tokens covered by {@code arg} and scales the sum to unit length.
+   * Each token is looked up as written first and lowercased second; tokens without an embedding
+   * are skipped.
+   *
+   * @param arg the mention whose tokens are combined
+   * @return the unit-length sum, or null when no token has an embedding or the sum has zero
+   *     length (so it cannot be normalised)
+   */
+  private double[] getArgVector(IdentifiedAnnotation arg) {
+    double[] vec = null;
+
+    Collection<BaseToken> tokens = JCasUtil.selectCovered(BaseToken.class, arg);
+    for (BaseToken token : tokens) {
+      String text = token.getCoveredText();
+      WordVector wv = words.getVector(text);
+      if (wv == null) {
+        wv = words.getVector(text.toLowerCase(Locale.ROOT));
+      }
+      if (wv == null) {
+        continue;
+      }
+      if (vec == null) {
+        // A new double[] is already zero-filled.
+        vec = new double[wv.size()];
+      }
+      for (int i = 0; i < vec.length; i++) {
+        vec[i] += wv.getValue(i);
+      }
+    }
+
+    if (vec == null) {
+      return null;
+    }
+
+    double len = 0.0;
+    for (int i = 0; i < vec.length; i++) {
+      len += vec[i] * vec[i];
+    }
+    len = Math.sqrt(len);
+
+    // Dividing by a zero (or NaN) norm would fill the vector with NaN; report "no vector" instead
+    // so the caller falls back to DEFAULT_SIM.
+    if (len == 0.0 || Double.isNaN(len)) {
+      return null;
+    }
+    for (int i = 0; i < vec.length; i++) {
+      vec[i] /= len;
+    }
+    return vec;
+  }
+}
Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java?rev=1611017&r1=1611016&r2=1611017&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java Wed Jul 16 14:17:40 2014
@@ -30,6 +30,18 @@ public class TokenFeatureExtractor imple
feats.add(new Feature("TOKEN_DEF2", isDefinite(s2)));
feats.add(new Feature("TOKEN_NUMAGREE",
numberSingular(arg1) == numberSingular(arg2)));
+
+ String gen1 = getGender(s1);
+ String gen2 = getGender(s2);
+ feats.add(new Feature("TOKEN_GEN1", gen1));
+ feats.add(new Feature("TOKEN_GEN2", gen2));
+ feats.add(new Feature("TOKEN_GENAGREE", gen1.equals(gen2)));
+
+ String p1 = getPerson(s1);
+ String p2 = getPerson(s2);
+ feats.add(new Feature("TOKEN_PERSON1", p1));
+ feats.add(new Feature("TOKEN_PERSON2", p2));
+ feats.add(new Feature("TOKEN_PERSONAGREE", p1.equals(p2)));
return feats;
}
@@ -53,12 +65,34 @@ public class TokenFeatureExtractor imple
for (int i = tokens.size()-1; i >=0; i--){
BaseToken t = tokens.get(i);
String pos = t.getPartOfSpeech();
- if (pos.equals("NN") || pos.equals("NNP")){
+ if ("NN".equals(pos) || "NNP".equals(pos)){
return true;
- }else if (pos.equals("NNS") || pos.equals("NNPS")){
+ }else if ("NNS".equals(pos) || "NNPS".equals(pos)){
return false;
+ }else if(t.getCoveredText().toLowerCase().equals("we")){
+ return true;
}
}
return true;
}
+
+ public static String getGender(String s1){
+ if(s1.equals("he") || s1.equals("his") || s1.startsWith("mr.")) return "MALE";
+ else if(s1.equals("she") || s1.equals("her") || s1.startsWith("mrs.") || s1.startsWith("ms.")) return "FEMALE";
+ else return "NEUTER";
+ }
+
+ public static String getPerson(String s1){
+ if(s1.equals("i")) return "FIRST";
+ else if(s1.equals("he") || s1.equals("she") || s1.equals("his") || s1.equals("her") || s1.equals("hers")){
+ return "THIRD";
+ }else if(s1.equals("you") || s1.equals("your")) return "SECOND";
+ else if(s1.equals("we")) return "FIRST_PLURAL";
+ else return "NONE";
+ }
+
+ /**
+  * Reports whether the mention string is treated as animate; only the exact string "i" is.
+  *
+  * @param s1 mention text; must not be null
+  * @return true only when {@code s1} equals "i"
+  */
+ public static boolean getAnimate(String s1){
+   return s1.equals("i");
+ }
}
Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java?rev=1611017&r1=1611016&r2=1611017&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java Wed Jul 16 14:17:40 2014
@@ -4,13 +4,17 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.cleartk.classifier.Feature;
+import org.uimafit.util.JCasUtil;
public class UMLSFeatureExtractor implements RelationFeaturesExtractor {
@@ -19,28 +23,51 @@ public class UMLSFeatureExtractor implem
IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
List<Feature> feats = new ArrayList<Feature>();
- feats.add(new Feature("UMLS_ALIAS", alias(arg1, arg2)));
-
+ if(arg1 instanceof Markable && arg2 instanceof Markable){
+ // get the head of each markable
+ ConllDependencyNode head1 = DependencyUtility.getNominalHeadNode(jCas, arg1);
+ ConllDependencyNode head2 = DependencyUtility.getNominalHeadNode(jCas, arg2);
+
+ if(head1 != null && head2 != null){
+ List<IdentifiedAnnotation> ents1 = JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());
+ List<IdentifiedAnnotation> ents2 = JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head2.getBegin(), head2.getEnd());
+
+ for(IdentifiedAnnotation ent1 : ents1){
+ for(IdentifiedAnnotation ent2 : ents2){
+ if(alias(ent1, ent2)){
+ feats.add(new Feature("UMLS_ALIAS", true));
+ break;
+ }
+ }
+ }
+ }
+ }
return feats;
}
- public static boolean alias(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
- FSArray fsa = a1.getOntologyConceptArr();
- HashSet<String> cuis = new HashSet<String>();
- for(int i = 0; i < fsa.size(); i++){
- if(fsa.get(i) instanceof UmlsConcept){
- cuis.add(((UmlsConcept)fsa.get(i)).getCui());
- }
- }
- fsa = a2.getOntologyConceptArr();
- for(int i = 0; i < fsa.size(); i++){
- if(fsa.get(i) instanceof UmlsConcept){
- if(cuis.contains(((UmlsConcept)fsa.get(i)).getCui())){
- return true;
- }
- }
- }
-
+ /**
+  * Tests whether two annotations share at least one UMLS CUI in their ontology concept arrays.
+  *
+  * @param a1 first annotation; may be null
+  * @param a2 second annotation; may be null
+  * @return true when some {@code UmlsConcept} of {@code a2} has a CUI that also occurs among
+  *     the {@code UmlsConcept}s of {@code a1}; false otherwise, including when either
+  *     annotation or either concept array is null
+  */
+ public static boolean alias(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+   if(a1 == null || a2 == null){
+     return false;
+   }
+
+   FSArray firstConcepts = a1.getOntologyConceptArr();
+   if(firstConcepts == null){
+     return false;
+   }
+
+   // Collect every CUI attached to the first annotation.
+   HashSet<String> firstCuis = new HashSet<String>();
+   for(int i = 0; i < firstConcepts.size(); i++){
+     Object concept = firstConcepts.get(i);
+     if(concept instanceof UmlsConcept){
+       firstCuis.add(((UmlsConcept)concept).getCui());
+     }
+   }
+
+   FSArray secondConcepts = a2.getOntologyConceptArr();
+   if(secondConcepts == null){
+     return false;
+   }
+
+   // Any CUI in common makes the two annotations aliases.
+   for(int i = 0; i < secondConcepts.size(); i++){
+     Object concept = secondConcepts.get(i);
+     if(concept instanceof UmlsConcept && firstCuis.contains(((UmlsConcept)concept).getCui())){
+       return true;
+     }
+   }
return false;
}
}