You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2016/06/29 20:05:37 UTC

svn commit: r1750710 - in /ctakes/trunk/ctakes-core: ./ src/main/java/org/apache/ctakes/core/cleartk/

Author: tmill
Date: Wed Jun 29 20:05:37 2016
New Revision: 1750710

URL: http://svn.apache.org/viewvc?rev=1750710&view=rev
Log:
Added some cleartk-derived feature extractors for working with embeddings/neural networks.

Added:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java
Modified:
    ctakes/trunk/ctakes-core/pom.xml

Modified: ctakes/trunk/ctakes-core/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/pom.xml?rev=1750710&r1=1750709&r2=1750710&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/pom.xml (original)
+++ ctakes/trunk/ctakes-core/pom.xml Wed Jun 29 20:05:37 2016
@@ -109,5 +109,9 @@
 			<groupId>org.apache.uima</groupId>
 			<artifactId>uimafit-core</artifactId>
 		</dependency>
+		<dependency>
+			<groupId>org.cleartk</groupId>
+			<artifactId>cleartk-ml</artifactId>
+		</dependency>
 	</dependencies>
 </project>

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,79 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.NamedFeatureExtractor1;
+import org.cleartk.ml.Feature;
+
+
+/**
+ * Feature extractor that turns a {@link BaseToken} into one numeric feature per dimension of
+ * its word-embedding vector.
+ *
+ * The token's covered text is looked up in the embeddings as-is and then lowercased. Tokens
+ * found under neither form are handled according to the configured {@link OovStrategy}.
+ */
+public class ContinuousTextExtractor implements NamedFeatureExtractor1<BaseToken> {
+  /**
+   * What to emit for a token that is not in the embeddings:
+   * {@code OOV_FEATURE} emits a single feature whose value is the string "OOV",
+   * {@code EMPTY_VECTOR} emits an all-zero vector of the embeddings' dimensionality,
+   * {@code MEAN_VECTOR} emits the vector returned by {@code WordEmbeddings.getMeanVector()}.
+   */
+  public enum OovStrategy {OOV_FEATURE, EMPTY_VECTOR, MEAN_VECTOR}
+
+  private final WordEmbeddings words;
+  private final OovStrategy oovStrategy;
+
+  /**
+   * Loads the embeddings from {@code vecFile} and uses {@link OovStrategy#OOV_FEATURE} for
+   * unknown tokens.
+   *
+   * @param vecFile location of the embeddings file, resolved through {@code FileLocator}
+   * @throws CleartkExtractorException if reading the embeddings fails with an I/O error
+   */
+  public ContinuousTextExtractor(String vecFile) throws CleartkExtractorException {
+    this(vecFile, OovStrategy.OOV_FEATURE);
+  }
+
+  /**
+   * Loads the embeddings from {@code vecFile}.
+   *
+   * @param vecFile location of the embeddings file, resolved through {@code FileLocator}
+   * @param oovStrategy how to represent tokens missing from the embeddings; {@code null} is
+   *          treated as {@link OovStrategy#OOV_FEATURE}, matching the one-argument constructor
+   * @throws CleartkExtractorException if reading the embeddings fails with an I/O error
+   */
+  public ContinuousTextExtractor(String vecFile, OovStrategy oovStrategy)
+      throws CleartkExtractorException {
+    WordEmbeddings loaded;
+    try {
+      loaded = WordVectorReader.getEmbeddings(FileLocator.getAsStream(vecFile));
+    } catch (IOException e) {
+      // The cause travels with the wrapped exception; no need to print it here as well.
+      throw new CleartkExtractorException(e);
+    }
+    this.words = loaded;
+    // A null strategy used to surface later as a NullPointerException on the first
+    // out-of-vocabulary token; fall back to the documented default instead.
+    this.oovStrategy = oovStrategy == null ? OovStrategy.OOV_FEATURE : oovStrategy;
+  }
+
+  /**
+   * Extracts the embedding features for one token.
+   *
+   * @param view the CAS view (not used by this extractor)
+   * @param token the token whose covered text is looked up
+   * @return features named {@code ContinuousText_<i>} holding the i-th vector component, or a
+   *         single {@code ContinuousText} feature with value "OOV" when the token is unknown
+   *         and the strategy is {@link OovStrategy#OOV_FEATURE}
+   */
+  @Override
+  public List<Feature> extract(JCas view, BaseToken token) throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+
+    String wordText = token.getCoveredText();
+    // Locale.ROOT keeps the lowercasing independent of the JVM's default locale
+    // (e.g. a Turkish default locale would map "I" to a dotless i).
+    String lowerText = wordText.toLowerCase(java.util.Locale.ROOT);
+
+    WordVector vec;
+    if (words.containsKey(wordText)) {
+      vec = words.getVector(wordText);
+    } else if (words.containsKey(lowerText)) {
+      vec = words.getVector(lowerText);
+    } else if (oovStrategy == OovStrategy.EMPTY_VECTOR) {
+      vec = new WordVector("_empty_", new double[words.getDimensionality()]);
+    } else if (oovStrategy == OovStrategy.MEAN_VECTOR) {
+      vec = words.getMeanVector();
+    } else {
+      // OOV_FEATURE: a single symbolic feature instead of a vector.
+      feats.add(new Feature(getFeatureName(), "OOV"));
+      return feats;
+    }
+
+    for (int i = 0; i < vec.size(); i++) {
+      feats.add(new Feature(getFeatureName() + "_" + i, vec.getValue(i)));
+    }
+    return feats;
+  }
+
+  /**
+   * @return the dimensionality reported by the loaded embeddings
+   */
+  public int getEmbeddingsDimensionality() {
+    return words.getDimensionality();
+  }
+
+  @Override
+  public String getFeatureName() {
+    return "ContinuousText";
+  }
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,59 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Following} context that pads with zeros: every feature whose value starts with
+ * "OOB" (presumably ClearTK's marker for a window position outside the bounds -- confirm
+ * against the ClearTK version in use) is replaced by {@code dims} features valued 0.0, so the
+ * output keeps a fixed width when the wrapped extractor emits {@code dims} features per token.
+ */
+public class FollowingWithPadding extends Following {
+
+  /** Number of zero-valued features emitted in place of each "OOB" feature. */
+  public int dims;
+
+  /**
+   * @param end passed to {@link Following#Following(int)}
+   * @param dims number of zero-valued features to emit for each "OOB" feature
+   */
+  public FollowingWithPadding(int end, int dims) {
+    super(end);
+    this.dims = dims;
+  }
+
+  /**
+   * Delegates to {@link Following} and then expands each "OOB" feature into {@code dims}
+   * zero-valued features named {@code <originalName>_<j>}; all other features pass through
+   * unchanged and in their original order.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    List<Feature> rawFeats =
+        super.extract(jCas, focusAnnotation, bounds, annotationClass, extractor);
+    List<Feature> processedFeats = new ArrayList<>();
+
+    for (Feature feat : rawFeats) {
+      Object value = feat.getValue();
+      // A feature with no value cannot be an "OOB" marker; pass it through instead of failing.
+      if (value != null && value.toString().startsWith("OOB")) {
+        // add one feature for each dimension and set it to 0.
+        for (int j = 0; j < this.dims; j++) {
+          processedFeats.add(new Feature(feat.getName() + "_" + j, 0.0));
+        }
+      } else {
+        processedFeats.add(feat);
+      }
+    }
+    return processedFeats;
+  }
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,74 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Context} that combines the numeric features produced by other contexts by keeping,
+ * for each feature name, the largest value seen.
+ */
+public class MaxContext implements CleartkExtractor.Context {
+
+  private final Context[] contexts;
+
+  private final String name;
+
+  /**
+   * Constructs a {@link Context} which takes the per-feature-name maximum over the features
+   * extracted by the argument contexts.
+   *
+   * @param contexts
+   *          The contexts whose features should be combined.
+   */
+  public MaxContext(Context... contexts) {
+    this.contexts = contexts;
+    String[] names = new String[contexts.length + 1];
+    names[0] = "Max";
+    for (int i = 0; i < contexts.length; ++i) {
+      names[i + 1] = contexts[i].getName();
+    }
+    this.name = Feature.createName(names);
+  }
+
+  @Override
+  public String getName() {
+    return this.name;
+  }
+
+  /**
+   * Runs every wrapped context and returns one feature per distinct feature name, named
+   * {@code <contextName>_<featureName>}, holding the maximum of the values observed under that
+   * name. Features with a missing or non-numeric value are skipped. Output follows the order
+   * in which feature names were first seen.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    // Insertion-ordered so the output order is deterministic (as in SumContext).
+    HashMap<String, Double> maxima = new java.util.LinkedHashMap<>();
+
+    for (Context context : this.contexts) {
+      for (Feature feature : context.extract(
+          jCas,
+          focusAnnotation,
+          bounds,
+          annotationClass,
+          extractor)) {
+        if (feature == null || feature.getValue() == null) {
+          continue;
+        }
+        double val;
+        try {
+          val = Double.parseDouble(feature.getValue().toString());
+        } catch (NumberFormatException ignored) {
+          // Non-numeric features (e.g. symbolic markers) have no maximum; skip them.
+          continue;
+        }
+        // Seed with the first observed value rather than 0.0, otherwise a feature whose
+        // values are all negative would wrongly report a maximum of 0.0.
+        Double previous = maxima.get(feature.getName());
+        maxima.put(feature.getName(), previous == null ? val : Math.max(previous, val));
+      }
+    }
+    List<Feature> features = new ArrayList<>();
+    for (String key : maxima.keySet()) {
+      features.add(new Feature(this.name + "_" + key, maxima.get(key)));
+    }
+    return features;
+  }
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,74 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Context} that combines the numeric features produced by other contexts by keeping,
+ * for each feature name, the smallest value seen.
+ */
+public class MinContext implements CleartkExtractor.Context {
+
+  private final Context[] contexts;
+
+  private final String name;
+
+  /**
+   * Constructs a {@link Context} which takes the per-feature-name minimum over the features
+   * extracted by the argument contexts.
+   *
+   * @param contexts
+   *          The contexts whose features should be combined.
+   */
+  public MinContext(Context... contexts) {
+    this.contexts = contexts;
+    String[] names = new String[contexts.length + 1];
+    names[0] = "Min";
+    for (int i = 0; i < contexts.length; ++i) {
+      names[i + 1] = contexts[i].getName();
+    }
+    this.name = Feature.createName(names);
+  }
+
+  @Override
+  public String getName() {
+    return this.name;
+  }
+
+  /**
+   * Runs every wrapped context and returns one feature per distinct feature name, named
+   * {@code <contextName>_<featureName>}, holding the minimum of the values observed under that
+   * name. Features with a missing or non-numeric value are skipped. Output follows the order
+   * in which feature names were first seen.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    // Insertion-ordered so the output order is deterministic (as in SumContext).
+    HashMap<String, Double> minima = new java.util.LinkedHashMap<>();
+
+    for (Context context : this.contexts) {
+      for (Feature feature : context.extract(
+          jCas,
+          focusAnnotation,
+          bounds,
+          annotationClass,
+          extractor)) {
+        if (feature == null || feature.getValue() == null) {
+          continue;
+        }
+        double val;
+        try {
+          val = Double.parseDouble(feature.getValue().toString());
+        } catch (NumberFormatException ignored) {
+          // Non-numeric features (e.g. symbolic markers) have no minimum; skip them.
+          continue;
+        }
+        // Seed with the first observed value rather than 0.0, otherwise a feature whose
+        // values are all positive would wrongly report a minimum of 0.0.
+        Double previous = minima.get(feature.getName());
+        minima.put(feature.getName(), previous == null ? val : Math.min(previous, val));
+      }
+    }
+    List<Feature> features = new ArrayList<>();
+    for (String key : minima.keySet()) {
+      features.add(new Feature(this.name + "_" + key, minima.get(key)));
+    }
+    return features;
+  }
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,44 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
+
+/**
+ * A {@link Preceding} context that pads with zeros: every feature whose value starts with
+ * "OOB" (presumably ClearTK's marker for a window position outside the bounds -- confirm
+ * against the ClearTK version in use) is replaced by {@code dims} features valued 0.0, so the
+ * output keeps a fixed width when the wrapped extractor emits {@code dims} features per token.
+ */
+public class PrecedingWithPadding extends Preceding {
+
+  /** Number of zero-valued features emitted in place of each "OOB" feature. */
+  public int dims;
+
+  /**
+   * @param end passed as the second argument of {@link Preceding#Preceding(int, int)}; the
+   *          first argument is fixed at 0
+   * @param dims number of zero-valued features to emit for each "OOB" feature
+   */
+  public PrecedingWithPadding(int end, int dims) {
+    super(0, end);
+    this.dims = dims;
+  }
+
+  /**
+   * Delegates to {@link Preceding} and then expands each "OOB" feature into {@code dims}
+   * zero-valued features named {@code <originalName>_<j>}; all other features pass through
+   * unchanged and in their original order.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    List<Feature> rawFeats =
+        super.extract(jCas, focusAnnotation, bounds, annotationClass, extractor);
+    List<Feature> processedFeats = new ArrayList<>();
+
+    for (Feature feat : rawFeats) {
+      Object value = feat.getValue();
+      // A feature with no value cannot be an "OOB" marker; pass it through instead of failing.
+      if (value != null && value.toString().startsWith("OOB")) {
+        // add one feature for each dimension and set it to 0.
+        for (int j = 0; j < this.dims; j++) {
+          processedFeats.add(new Feature(feat.getName() + "_" + j, 0.0));
+        }
+      } else {
+        processedFeats.add(feat);
+      }
+    }
+    return processedFeats;
+  }
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,75 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Context} that combines the numeric features produced by other contexts by adding
+ * up, for each feature name, all values seen.
+ */
+public class SumContext implements CleartkExtractor.Context {
+
+  private final Context[] contexts;
+
+  private final String name;
+
+  /**
+   * Constructs a {@link Context} which sums, per feature name, the features extracted by the
+   * argument contexts.
+   *
+   * @param contexts
+   *          The contexts whose features should be combined.
+   */
+  public SumContext(Context... contexts) {
+    this.contexts = contexts;
+    String[] names = new String[contexts.length + 1];
+    names[0] = "Sum";
+    for (int i = 0; i < contexts.length; ++i) {
+      names[i + 1] = contexts[i].getName();
+    }
+    this.name = Feature.createName(names);
+  }
+
+  @Override
+  public String getName() {
+    return this.name;
+  }
+
+  /**
+   * Runs every wrapped context and returns one feature per distinct feature name, named
+   * {@code <contextName>_<featureName>}, holding the sum of the values observed under that
+   * name. Features with a missing or non-numeric value are skipped. Output follows the order
+   * in which feature names were first seen.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    LinkedHashMap<String, Double> runningTotals = new LinkedHashMap<>();
+
+    for (Context context : this.contexts) {
+      for (Feature feature : context.extract(
+          jCas,
+          focusAnnotation,
+          bounds,
+          annotationClass,
+          extractor)) {
+        if (feature == null || feature.getValue() == null) {
+          continue;
+        }
+        double val;
+        try {
+          val = Double.parseDouble(feature.getValue().toString());
+        } catch (NumberFormatException ignored) {
+          // Non-numeric features (e.g. symbolic markers) cannot be summed; skip them.
+          continue;
+        }
+        Double soFar = runningTotals.get(feature.getName());
+        runningTotals.put(feature.getName(), (soFar == null ? 0.0 : soFar) + val);
+      }
+    }
+    List<Feature> features = new ArrayList<>();
+    for (String key : runningTotals.keySet()) {
+      features.add(new Feature(this.name + "_" + key, runningTotals.get(key)));
+    }
+    return features;
+  }
+
+}