You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2016/05/23 14:42:16 UTC

svn commit: r1745202 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae: EventEventRelationAnnotator.java EventTimeSelfRelationAnnotator.java feature/RelationEmbeddingFeatureExtractor.java

Author: clin
Date: Mon May 23 14:42:16 2016
New Revision: 1745202

URL: http://svn.apache.org/viewvc?rev=1745202&view=rev
Log:
adding embedding features for event-time and event-event relation annotators

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java
Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java?rev=1745202&r1=1745201&r2=1745202&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java Mon May 23 14:42:16 2016
@@ -44,6 +44,7 @@ import org.apache.ctakes.temporal.ae.fea
 import org.apache.ctakes.temporal.ae.feature.TemporalPETFlatExtractor;
 import org.apache.ctakes.temporal.ae.feature.TokenPropertyFeaturesExtractor;
 import org.apache.ctakes.temporal.ae.feature.DeterminerRelationFeaturesExtractor;
+import org.apache.ctakes.temporal.ae.feature.RelationEmbeddingFeatureExtractor;
 import org.apache.ctakes.temporal.ae.feature.EventArgumentPropertyExtractor;
 import org.apache.ctakes.temporal.ae.feature.EventTimeRelationFeatureExtractor;
 import org.apache.ctakes.temporal.ae.feature.EventPositionRelationFeaturesExtractor;
@@ -72,6 +73,7 @@ import org.apache.uima.jcas.tcas.Annotat
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.ml.CleartkAnnotator;
 import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
 import org.cleartk.ml.jar.DefaultDataWriterFactory;
 import org.cleartk.ml.jar.DirectoryDataWriterFactory;
 import org.cleartk.ml.jar.GenericJarClassifierFactory;
@@ -125,12 +127,22 @@ public class EventEventRelationAnnotator
 				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
 				new File(modelDirectory, "model.jar"));
 	}
+	
+	private RelationEmbeddingFeatureExtractor embedingExtractor;
 
 	@Override
 	protected List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> getFeatureExtractors() {
+		final String vectorFile = "org/apache/ctakes/temporal/mimic_vectors.txt";
+		try {
+			this.embedingExtractor = new RelationEmbeddingFeatureExtractor(vectorFile);
+		} catch (CleartkExtractorException e) {
+			System.err.println("cannot find file: "+ vectorFile);
+			e.printStackTrace();
+		}
 		return Lists.newArrayList(
 				new UnexpandedTokenFeaturesExtractor() //new TokenFeaturesExtractor()		
 //				, new EmptyFeaturesExtractor()
+				, embedingExtractor
 				, new PartOfSpeechFeaturesExtractor()
 				, new EventArgumentPropertyExtractor()
 				, new UmlsFeatureExtractor()

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java?rev=1745202&r1=1745201&r2=1745202&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java Mon May 23 14:42:16 2016
@@ -30,8 +30,10 @@ import org.apache.ctakes.relationextract
 import org.apache.ctakes.relationextractor.ae.features.TokenFeaturesExtractor;
 import org.apache.ctakes.temporal.ae.feature.CheckSpecialWordRelationExtractor;
 import org.apache.ctakes.temporal.ae.feature.ConjunctionRelationFeaturesExtractor;
+import org.apache.ctakes.temporal.ae.feature.ContinuousTextExtractor;
 import org.apache.ctakes.temporal.ae.feature.DependencyFeatureExtractor;
 import org.apache.ctakes.temporal.ae.feature.DependencyPathFeaturesExtractor;
+import org.apache.ctakes.temporal.ae.feature.RelationEmbeddingFeatureExtractor;
 import org.apache.ctakes.temporal.ae.feature.EmptyFeaturesExtractor;
 import org.apache.ctakes.temporal.ae.feature.EventArgumentPropertyExtractor;
 import org.apache.ctakes.temporal.ae.feature.MultiTokenFeaturesExtractor;
@@ -63,12 +65,14 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.TimeMention;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.ml.CleartkAnnotator;
 import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
 import org.cleartk.ml.jar.DefaultDataWriterFactory;
 import org.cleartk.ml.jar.DirectoryDataWriterFactory;
 import org.cleartk.ml.jar.GenericJarClassifierFactory;
@@ -120,11 +124,21 @@ public class EventTimeSelfRelationAnnota
 				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
 				new File(modelDirectory, "model.jar"));
 	}
+	
+	private RelationEmbeddingFeatureExtractor embedingExtractor;
 
 	@Override
 	protected List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> getFeatureExtractors() {
+		final String vectorFile = "org/apache/ctakes/temporal/mimic_vectors.txt";
+		try {
+			this.embedingExtractor = new RelationEmbeddingFeatureExtractor(vectorFile);
+		} catch (CleartkExtractorException e) {
+			System.err.println("cannot find file: "+ vectorFile);
+			e.printStackTrace();
+		}
 		return Lists.newArrayList(
-				new UnexpandedTokenFeaturesExtractor()//new TokenFeaturesExtractor()							
+				new UnexpandedTokenFeaturesExtractor()//new TokenFeaturesExtractor()	
+				, embedingExtractor
 				, new NearestFlagFeatureExtractor()
 				, new DependencyPathFeaturesExtractor()
 				, new EventArgumentPropertyExtractor()

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java?rev=1745202&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java Mon May 23 14:42:16 2016
@@ -0,0 +1,205 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+
+/**
+ * Relation features built from word embeddings: the per-dimension average vector of the words between the two arguments.
+ */
+public class RelationEmbeddingFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation, IdentifiedAnnotation> {
+
+	private int numberOfDimensions;
+	private WordEmbeddings words = null;
+
+	/** Loads the embeddings found at {@code vecFile}; IO failures are rethrown as CleartkExtractorException. */
+	public RelationEmbeddingFeatureExtractor(String vecFile) throws CleartkExtractorException {
+		// try-with-resources guarantees the stream opened here is closed, even when reading fails.
+		try (java.io.InputStream vectorStream = FileLocator.getAsStream(vecFile)) {
+			words = WordVectorReader.getEmbeddings(vectorStream);
+		} catch (IOException e) {
+			// The cause travels with the rethrown exception, so no separate stack dump is printed.
+			throw new CleartkExtractorException(e);
+		}
+		numberOfDimensions = words.getDimensionality();
+	}
+
+	/**
+	 * Averages the embedding vectors of the words lying between the two arguments and emits
+	 * one {@code average_dim_<i>} feature per dimension. Words missing from the embeddings
+	 * fall back to the vector for "and".
+	 *
+	 * @param jCas CAS searched for {@link WordToken}s between the arguments
+	 * @param arg1 first argument of the candidate relation
+	 * @param arg2 second argument of the candidate relation
+	 * @return the averaged features, or an empty list when no word lies between the arguments
+	 */
+	@Override
+	public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1, IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+		List<Feature> features = new ArrayList<>();
+
+		// Disabled experiments (argument last-word and whole-argument vectors), kept for reference:
+//		String arg1LastWord = Utils.getLastWord(jCas, arg1).toLowerCase();
+//		String arg2LastWord = Utils.getLastWord(jCas, arg2).toLowerCase();
+		//		WordVector arg1Vector;
+		//		if(words.containsKey(arg1LastWord)) {
+		//			arg1Vector = words.getVector(arg1LastWord);
+		//		} else {
+		//			arg1Vector = words.getVector("and");
+		//		}
+		//		WordVector arg2Vector;
+		//		if(words.containsKey(arg2LastWord)) {
+		//			arg2Vector = words.getVector(arg2LastWord);
+		//		} else {
+		//			arg2Vector = words.getVector("and");
+		//		}
+		// head word features
+		//		for(int dim = 0; dim < numberOfDimensions; dim++) {
+		//			String featureName = String.format("arg1_dim_%d", dim);
+		//			features.add(new Feature(featureName, arg1Vector.getValue(dim)));
+		//		}
+		//		for(int dim = 0; dim < numberOfDimensions; dim++) {
+		//			String featureName = String.format("arg2_dim_%d", dim);
+		//			features.add(new Feature(featureName, arg2Vector.getValue(dim)));
+		//		}
+		// head word similarity features
+//		List<WordToken> wordsOfArgs1 = JCasUtil.selectCovered(jCas, WordToken.class, arg1);
+//		List<Double> arg1Vec = getGroupVector(wordsOfArgs1);
+//		List<WordToken> wordsOfArgs2 = JCasUtil.selectCovered(jCas, WordToken.class, arg2);
+//		List<Double> arg2Vec = getGroupVector(wordsOfArgs2);
+//		for(int dim = 0; dim < numberOfDimensions; dim++) {
+//			String featureName = String.format("arg1_dim_%d", dim);
+//			features.add(new Feature(featureName, arg1Vec.get(dim)));
+//		}
+//		for(int dim = 0; dim < numberOfDimensions; dim++) {
+//			String featureName = String.format("arg2_dim_%d", dim);
+//			features.add(new Feature(featureName, arg2Vec.get(dim)));
+//		}
+//		double similarity = computeCosineSimilarity(arg1Vec, arg2Vec);
+//		features.add(new Feature("arg_cos_sim", similarity));
+
+		// words between argument features
+		List<WordToken> wordsBetweenArgs = JCasUtil.selectBetween(jCas, WordToken.class, arg1, arg2);
+//		wordsBetweenArgs.addAll(wordsOfArgs1);
+//		wordsBetweenArgs.addAll(wordsOfArgs2);
+		if(wordsBetweenArgs.size() < 1) {
+			return features;
+		}
+
+		List<Double> sum = new ArrayList<>(Collections.nCopies(numberOfDimensions, 0.0));
+		for(WordToken wordToken : wordsBetweenArgs) {
+			// Lower-case once, with Locale.ROOT so the lookup key does not depend on the JVM default locale.
+			String key = wordToken.getCoveredText().toLowerCase(java.util.Locale.ROOT);
+			// Unknown words use the vector for "and". NOTE(review): assumes "and" is in the vocabulary - confirm.
+			WordVector wordVector = words.getVector(words.containsKey(key) ? key : "and");
+			sum = addVectors(sum, wordVector);
+		}
+
+		// Divide each summed dimension by the number of words to obtain the average vector.
+		for(int dim = 0; dim < numberOfDimensions; dim++) {
+			String featureName = String.format("average_dim_%d", dim);
+			features.add(new Feature(featureName, sum.get(dim) / wordsBetweenArgs.size()));
+		}
+		return features;
+	}
+
+	/**
+	private List<Double> getGroupVector(List<WordToken> wordsOfArgs) {
+		List<Double> argVec = new ArrayList<>(Collections.nCopies(numberOfDimensions, 0.0));
+
+		for(WordToken wordToken : wordsOfArgs) {
+			WordVector wordVector;
+			if(words.containsKey(wordToken.getCoveredText().toLowerCase())) {
+				wordVector = words.getVector(wordToken.getCoveredText().toLowerCase());
+			} else {
+				wordVector = words.getVector("and");
+			}
+			argVec = addVectors(argVec, wordVector);    
+		}
+		int numOfWords = wordsOfArgs.size();
+		if( numOfWords > 1){
+			for(int dim = 0; dim < numberOfDimensions; dim++) {
+				argVec.set(dim, argVec.get(dim) /numOfWords) ;
+			}
+		}
+		return argVec;
+	}*/
+
+	/**
+	 * Returns the smoothed cosine similarity of two word vectors over all embedding dimensions.
+	 * Both squared norms start at 0.01, which keeps the denominator positive for zero vectors.
+	 */
+	public double computeCosineSimilarity(WordVector vector1, WordVector vector2) {
+		double squaredNorm1 = 0.01;
+		double squaredNorm2 = 0.01;
+		double dot = 0.0;
+		for (int i = 0; i < numberOfDimensions; i++) {
+			squaredNorm1 += Math.pow(vector1.getValue(i), 2);
+			squaredNorm2 += Math.pow(vector2.getValue(i), 2);
+			dot += vector1.getValue(i) * vector2.getValue(i);
+		}
+
+		final double denominator = Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2);
+		return dot / denominator;
+	}
+
+	/** Cosine similarity, smoothed like the {@link WordVector} overload, for vectors held as lists. */
+	public double computeCosineSimilarity(List<Double> vector1, List<Double> vector2) {
+		double dot = 0.0;
+		double squaredNorm1 = 0.01; // non-zero start keeps the denominator positive
+		double squaredNorm2 = 0.01;
+		for (int i = 0; i < numberOfDimensions; i++) {
+			final double left = vector1.get(i);
+			final double right = vector2.get(i);
+			dot += left * right;
+			squaredNorm1 += Math.pow(left, 2);
+			squaredNorm2 += Math.pow(right, 2);
+		}
+		return dot / (Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2));
+	}
+
+	/**
+	 * Add two vectors. Return the sum vector.
+	 */
+	public List<Double> addVectors(List<Double> vector1, WordVector vector2) {
+
+		List<Double> sum = new ArrayList<>();
+		for(int dim = 0; dim < numberOfDimensions; dim++) {
+			sum.add(vector1.get(dim) + vector2.getValue(dim));
+		}
+
+		return sum;
+	}
+}