You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by cl...@apache.org on 2016/05/23 14:42:16 UTC
svn commit: r1745202 - in
/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae:
EventEventRelationAnnotator.java EventTimeSelfRelationAnnotator.java
feature/RelationEmbeddingFeatureExtractor.java
Author: clin
Date: Mon May 23 14:42:16 2016
New Revision: 1745202
URL: http://svn.apache.org/viewvc?rev=1745202&view=rev
Log:
adding embedding features for event-time and event-event relation annotators
Added:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java
Modified:
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java
ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java?rev=1745202&r1=1745201&r2=1745202&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventEventRelationAnnotator.java Mon May 23 14:42:16 2016
@@ -44,6 +44,7 @@ import org.apache.ctakes.temporal.ae.fea
import org.apache.ctakes.temporal.ae.feature.TemporalPETFlatExtractor;
import org.apache.ctakes.temporal.ae.feature.TokenPropertyFeaturesExtractor;
import org.apache.ctakes.temporal.ae.feature.DeterminerRelationFeaturesExtractor;
+import org.apache.ctakes.temporal.ae.feature.RelationEmbeddingFeatureExtractor;
import org.apache.ctakes.temporal.ae.feature.EventArgumentPropertyExtractor;
import org.apache.ctakes.temporal.ae.feature.EventTimeRelationFeatureExtractor;
import org.apache.ctakes.temporal.ae.feature.EventPositionRelationFeaturesExtractor;
@@ -72,6 +73,7 @@ import org.apache.uima.jcas.tcas.Annotat
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.ml.CleartkAnnotator;
import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
import org.cleartk.ml.jar.DefaultDataWriterFactory;
import org.cleartk.ml.jar.DirectoryDataWriterFactory;
import org.cleartk.ml.jar.GenericJarClassifierFactory;
@@ -125,12 +127,22 @@ public class EventEventRelationAnnotator
GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
new File(modelDirectory, "model.jar"));
}
+
+ private RelationEmbeddingFeatureExtractor embedingExtractor;
@Override
protected List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> getFeatureExtractors() {
+ final String vectorFile = "org/apache/ctakes/temporal/mimic_vectors.txt";
+ try {
+ this.embedingExtractor = new RelationEmbeddingFeatureExtractor(vectorFile);
+ } catch (CleartkExtractorException e) {
+ System.err.println("cannot find file: "+ vectorFile);
+ e.printStackTrace();
+ }
return Lists.newArrayList(
new UnexpandedTokenFeaturesExtractor() //new TokenFeaturesExtractor()
// , new EmptyFeaturesExtractor()
+ , embedingExtractor
, new PartOfSpeechFeaturesExtractor()
, new EventArgumentPropertyExtractor()
, new UmlsFeatureExtractor()
Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java?rev=1745202&r1=1745201&r2=1745202&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventTimeSelfRelationAnnotator.java Mon May 23 14:42:16 2016
@@ -30,8 +30,10 @@ import org.apache.ctakes.relationextract
import org.apache.ctakes.relationextractor.ae.features.TokenFeaturesExtractor;
import org.apache.ctakes.temporal.ae.feature.CheckSpecialWordRelationExtractor;
import org.apache.ctakes.temporal.ae.feature.ConjunctionRelationFeaturesExtractor;
+import org.apache.ctakes.temporal.ae.feature.ContinuousTextExtractor;
import org.apache.ctakes.temporal.ae.feature.DependencyFeatureExtractor;
import org.apache.ctakes.temporal.ae.feature.DependencyPathFeaturesExtractor;
+import org.apache.ctakes.temporal.ae.feature.RelationEmbeddingFeatureExtractor;
import org.apache.ctakes.temporal.ae.feature.EmptyFeaturesExtractor;
import org.apache.ctakes.temporal.ae.feature.EventArgumentPropertyExtractor;
import org.apache.ctakes.temporal.ae.feature.MultiTokenFeaturesExtractor;
@@ -63,12 +65,14 @@ import org.apache.ctakes.typesystem.type
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.typesystem.type.textsem.TimeMention;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.cleartk.ml.CleartkAnnotator;
import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
import org.cleartk.ml.jar.DefaultDataWriterFactory;
import org.cleartk.ml.jar.DirectoryDataWriterFactory;
import org.cleartk.ml.jar.GenericJarClassifierFactory;
@@ -120,11 +124,21 @@ public class EventTimeSelfRelationAnnota
GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
new File(modelDirectory, "model.jar"));
}
+
+ private RelationEmbeddingFeatureExtractor embedingExtractor;
@Override
protected List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> getFeatureExtractors() {
+ final String vectorFile = "org/apache/ctakes/temporal/mimic_vectors.txt";
+ try {
+ this.embedingExtractor = new RelationEmbeddingFeatureExtractor(vectorFile);
+ } catch (CleartkExtractorException e) {
+ System.err.println("cannot find file: "+ vectorFile);
+ e.printStackTrace();
+ }
return Lists.newArrayList(
- new UnexpandedTokenFeaturesExtractor()//new TokenFeaturesExtractor()
+ new UnexpandedTokenFeaturesExtractor()//new TokenFeaturesExtractor()
+ , embedingExtractor
, new NearestFlagFeatureExtractor()
, new DependencyPathFeaturesExtractor()
, new EventArgumentPropertyExtractor()
Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java?rev=1745202&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/RelationEmbeddingFeatureExtractor.java Mon May 23 14:42:16 2016
@@ -0,0 +1,205 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+
+/**
+ * Relation features derived from word embeddings: the averaged embedding vector of the word tokens between the two arguments.
+ */
+public class RelationEmbeddingFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation, IdentifiedAnnotation> {
+
+ private int numberOfDimensions;
+ private WordEmbeddings words = null;
+
+  /**
+   * Loads the word embeddings this extractor works with and records their dimensionality.
+   *
+   * @param vecFile location of the embedding file, resolved through {@link FileLocator#getAsStream}
+   * @throws CleartkExtractorException if the file cannot be located or read; the underlying
+   *         {@link IOException} is kept as the cause
+   */
+  public RelationEmbeddingFeatureExtractor(String vecFile) throws CleartkExtractorException {
+    // try-with-resources closes the stream on every path, including a failed parse.
+    try (java.io.InputStream vectorStream = FileLocator.getAsStream(vecFile)) {
+      words = WordVectorReader.getEmbeddings(vectorStream);
+    } catch (IOException e) {
+      // The cause is attached to the rethrown exception, so printing the stack trace
+      // here as well would only duplicate the report.
+      throw new CleartkExtractorException(e);
+    }
+    numberOfDimensions = words.getDimensionality();
+  }
+
+  /**
+   * Builds one feature per embedding dimension holding the average vector of the word
+   * tokens selected between the two relation arguments.
+   *
+   * <p>Each token is looked up by its lower-cased covered text; a token that is not in the
+   * vocabulary is represented by the vector of "and". When no word token lies between the
+   * arguments an empty list is returned.
+   *
+   * @param jCas the CAS that holds the annotations
+   * @param arg1 first relation argument
+   * @param arg2 second relation argument
+   * @return features named {@code average_dim_<i>} for every dimension, or an empty list
+   * @throws AnalysisEngineProcessException declared by the overridden method
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation arg1, IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
+
+    List<Feature> features = new ArrayList<>();
+
+    // Only the between-argument average is emitted; earlier per-argument vector and
+    // argument cosine-similarity variants were disabled and are not reproduced here.
+    List<WordToken> wordsBetweenArgs = JCasUtil.selectBetween(jCas, WordToken.class, arg1, arg2);
+    if (wordsBetweenArgs.isEmpty()) {
+      return features;
+    }
+
+    // Accumulate in a primitive array instead of building a new boxed list per token.
+    double[] sum = new double[numberOfDimensions];
+    for (WordToken wordToken : wordsBetweenArgs) {
+      // Locale.ROOT keeps the vocabulary key independent of the JVM default locale.
+      String key = wordToken.getCoveredText().toLowerCase(java.util.Locale.ROOT);
+      WordVector wordVector = words.getVector(words.containsKey(key) ? key : "and");
+      if (wordVector == null) {
+        // NOTE(review): presumably getVector returns null when even the "and" fallback is
+        // missing from the embedding file - confirm. Such a token then contributes a zero
+        // vector instead of triggering a NullPointerException.
+        continue;
+      }
+      for (int dim = 0; dim < numberOfDimensions; dim++) {
+        sum[dim] += wordVector.getValue(dim);
+      }
+    }
+
+    // Every token between the arguments counts towards the average, as before.
+    int tokenCount = wordsBetweenArgs.size();
+    for (int dim = 0; dim < numberOfDimensions; dim++) {
+      // Plain concatenation avoids String.format's locale-sensitive digit formatting,
+      // which could otherwise change the feature names between training and runtime.
+      features.add(new Feature("average_dim_" + dim, sum[dim] / tokenCount));
+    }
+
+    return features;
+  }
+
+ /**
+ private List<Double> getGroupVector(List<WordToken> wordsOfArgs) {
+ List<Double> argVec = new ArrayList<>(Collections.nCopies(numberOfDimensions, 0.0));
+
+ for(WordToken wordToken : wordsOfArgs) {
+ WordVector wordVector;
+ if(words.containsKey(wordToken.getCoveredText().toLowerCase())) {
+ wordVector = words.getVector(wordToken.getCoveredText().toLowerCase());
+ } else {
+ wordVector = words.getVector("and");
+ }
+ argVec = addVectors(argVec, wordVector);
+ }
+ int numOfWords = wordsOfArgs.size();
+ if( numOfWords > 1){
+ for(int dim = 0; dim < numberOfDimensions; dim++) {
+ argVec.set(dim, argVec.get(dim) /numOfWords) ;
+ }
+ }
+ return argVec;
+ }*/
+
+  /**
+   * Computes the cosine similarity of two word vectors over the first
+   * {@code numberOfDimensions} components.
+   *
+   * <p>Both squared norms start at 0.01 rather than 0, which keeps the denominator
+   * away from zero.
+   *
+   * @param vector1 first vector
+   * @param vector2 second vector
+   * @return the dot product divided by the product of the two (seeded) norms
+   */
+  public double computeCosineSimilarity(WordVector vector1, WordVector vector2) {
+    double squaredNorm1 = 0.01;
+    double squaredNorm2 = 0.01;
+    double dot = 0.0;
+
+    int dim = 0;
+    while (dim < numberOfDimensions) {
+      dot = dot + vector1.getValue(dim) * vector2.getValue(dim);
+      squaredNorm1 = squaredNorm1 + Math.pow(vector1.getValue(dim), 2);
+      squaredNorm2 = squaredNorm2 + Math.pow(vector2.getValue(dim), 2);
+      dim++;
+    }
+
+    double denominator = Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2);
+    return dot / denominator;
+  }
+
+  /**
+   * Computes the cosine similarity of two vectors given as lists, over the first
+   * {@code numberOfDimensions} entries.
+   *
+   * <p>Both squared norms start at 0.01 rather than 0, which keeps the denominator
+   * away from zero.
+   *
+   * @param vector1 first vector
+   * @param vector2 second vector
+   * @return the dot product divided by the product of the two (seeded) norms
+   */
+  public double computeCosineSimilarity(List<Double> vector1, List<Double> vector2) {
+    double dot = 0.0;
+    double squaredNorm1 = 0.01;
+    double squaredNorm2 = 0.01;
+
+    for (int dim = 0; dim < numberOfDimensions; dim++) {
+      // Read each component once; the values are reused for the dot product and the norms.
+      double left = vector1.get(dim);
+      double right = vector2.get(dim);
+      dot += left * right;
+      squaredNorm1 += Math.pow(left, 2);
+      squaredNorm2 += Math.pow(right, 2);
+    }
+
+    return dot / (Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2));
+  }
+
+  /**
+   * Adds a word vector to an accumulated vector, component by component.
+   *
+   * @param vector1 accumulated values; read but not modified
+   * @param vector2 word vector whose components are added
+   * @return a new list holding the {@code numberOfDimensions} component sums
+   */
+  public List<Double> addVectors(List<Double> vector1, WordVector vector2) {
+    List<Double> total = new ArrayList<>();
+
+    int dim = 0;
+    while (dim < numberOfDimensions) {
+      double component = vector1.get(dim) + vector2.getValue(dim);
+      total.add(component);
+      dim++;
+    }
+
+    return total;
+  }
+}