You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2016/06/29 20:05:37 UTC
svn commit: r1750710 - in /ctakes/trunk/ctakes-core: ./
src/main/java/org/apache/ctakes/core/cleartk/
Author: tmill
Date: Wed Jun 29 20:05:37 2016
New Revision: 1750710
URL: http://svn.apache.org/viewvc?rev=1750710&view=rev
Log:
Added some cleartk-derived feature extractors for working with embeddings/neural networks.
Added:
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java
Modified:
ctakes/trunk/ctakes-core/pom.xml
Modified: ctakes/trunk/ctakes-core/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/pom.xml?rev=1750710&r1=1750709&r2=1750710&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/pom.xml (original)
+++ ctakes/trunk/ctakes-core/pom.xml Wed Jun 29 20:05:37 2016
@@ -109,5 +109,9 @@
<groupId>org.apache.uima</groupId>
<artifactId>uimafit-core</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.cleartk</groupId>
+ <artifactId>cleartk-ml</artifactId>
+ </dependency>
</dependencies>
</project>
Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/ContinuousTextExtractor.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,79 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.NamedFeatureExtractor1;
+import org.cleartk.ml.Feature;
+
+
+/**
+ * Feature extractor that looks a token's covered text up in a set of word embeddings and emits
+ * one numeric feature per embedding dimension, named {@code <featureName>_<index>}.
+ * <p>
+ * The exact covered text is tried first, then its lower-cased form. When neither is present in
+ * the embeddings, the configured {@link OovStrategy} decides what is emitted.
+ */
+public class ContinuousTextExtractor implements NamedFeatureExtractor1<BaseToken> {
+  /**
+   * How a token that is missing from the embeddings is represented:
+   * {@code OOV_FEATURE} emits a single feature with the string value "OOV",
+   * {@code EMPTY_VECTOR} emits an all-zero vector, and
+   * {@code MEAN_VECTOR} emits the mean vector reported by the embeddings.
+   */
+  public enum OovStrategy {OOV_FEATURE, EMPTY_VECTOR, MEAN_VECTOR}
+
+  private final WordEmbeddings words;
+  private final OovStrategy oovStrategy;
+
+  /**
+   * Creates an extractor that uses {@link OovStrategy#OOV_FEATURE} for unknown tokens.
+   *
+   * @param vecFile location of the word-vector file, resolved through {@code FileLocator}
+   * @throws CleartkExtractorException if the vector file cannot be read
+   */
+  public ContinuousTextExtractor(String vecFile) throws CleartkExtractorException {
+    this(vecFile, OovStrategy.OOV_FEATURE);
+  }
+
+  /**
+   * Creates an extractor with an explicit out-of-vocabulary strategy.
+   *
+   * @param vecFile location of the word-vector file, resolved through {@code FileLocator}
+   * @param oovStrategy how tokens missing from the embeddings are represented; must not be null
+   * @throws CleartkExtractorException if the vector file cannot be read
+   * @throws IllegalArgumentException if {@code oovStrategy} is null
+   */
+  public ContinuousTextExtractor(String vecFile, OovStrategy oovStrategy) throws CleartkExtractorException {
+    super();
+    // Fail fast: a null strategy would otherwise surface as a NullPointerException
+    // only when the first out-of-vocabulary token is extracted.
+    if (oovStrategy == null) {
+      throw new IllegalArgumentException("oovStrategy must not be null");
+    }
+    try {
+      this.words = WordVectorReader.getEmbeddings(FileLocator.getAsStream(vecFile));
+    } catch (IOException e) {
+      // The cause is preserved in the rethrown exception, so no separate stack-trace dump.
+      throw new CleartkExtractorException(e);
+    }
+    this.oovStrategy = oovStrategy;
+  }
+
+  /**
+   * Extracts the embedding of the token's covered text.
+   *
+   * @param view the CAS view the token belongs to (not consulted by this extractor)
+   * @param token the token whose covered text is looked up
+   * @return one feature per vector dimension, or a single "OOV" feature when the token is
+   *         unknown and the strategy is {@link OovStrategy#OOV_FEATURE}
+   */
+  @Override
+  public List<Feature> extract(JCas view, BaseToken token) throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+
+    String wordText = token.getCoveredText();
+    // Locale.ROOT keeps the lookup key independent of the JVM's default locale.
+    // Fully qualified because java.util.Locale is not among this file's imports.
+    String lowered = wordText.toLowerCase(java.util.Locale.ROOT);
+
+    WordVector vec;
+    if (words.containsKey(wordText)) {
+      vec = words.getVector(wordText);
+    } else if (words.containsKey(lowered)) {
+      vec = words.getVector(lowered);
+    } else {
+      switch (this.oovStrategy) {
+        case EMPTY_VECTOR:
+          vec = new WordVector("_empty_", new double[words.getDimensionality()]);
+          break;
+        case MEAN_VECTOR:
+          vec = words.getMeanVector();
+          break;
+        case OOV_FEATURE:
+        default:
+          feats.add(new Feature(getFeatureName(), "OOV"));
+          return feats;
+      }
+    }
+
+    for (int i = 0; i < vec.size(); i++) {
+      feats.add(new Feature(getFeatureName() + "_" + i, vec.getValue(i)));
+    }
+    return feats;
+  }
+
+  /**
+   * @return the dimensionality reported by the loaded embeddings
+   */
+  public int getEmbeddingsDimensionality() {
+    return words.getDimensionality();
+  }
+
+  @Override
+  public String getFeatureName() {
+    return "ContinuousText";
+  }
+
+}
Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/FollowingWithPadding.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,59 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Following} context that replaces every feature whose value starts with "OOB" by
+ * {@link #dims} zero-valued features, so the output has a fixed width per position.
+ * <p>
+ * NOTE(review): "OOB" is presumably the placeholder the parent context emits for positions that
+ * fall outside the extraction bounds; confirm against the ClearTK version in use.
+ * An earlier alternative that padded by repeating the last selected annotation was left
+ * disabled in the original source and is not reproduced here.
+ */
+public class FollowingWithPadding extends Following {
+
+  /** Number of zero-valued features emitted for each "OOB" feature. */
+  public int dims;
+
+  /**
+   * @param end passed unchanged to the {@link Following} constructor
+   * @param dims number of padding features emitted per "OOB" feature
+   */
+  public FollowingWithPadding(int end, int dims) {
+    super(end);
+    this.dims = dims;
+  }
+
+  /**
+   * Runs the parent extraction and expands each "OOB" feature into {@code dims} features named
+   * {@code <originalName>_<index>} with value 0.0; all other features pass through unchanged.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    List<Feature> padded = new ArrayList<>();
+
+    for (Feature raw : super.extract(jCas, focusAnnotation, bounds, annotationClass, extractor)) {
+      boolean outOfBounds = raw.getValue().toString().startsWith("OOB");
+      if (!outOfBounds) {
+        padded.add(raw);
+        continue;
+      }
+      // One zero feature per dimension stands in for the missing position.
+      String baseName = raw.getName();
+      for (int d = 0; d < this.dims; d++) {
+        padded.add(new Feature(baseName + "_" + d, 0.0));
+      }
+    }
+    return padded;
+  }
+}
Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MaxContext.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,74 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Context} that pools the numeric features produced by the wrapped contexts by keeping,
+ * for every distinct feature name, the largest value seen.
+ * <p>
+ * The running value for a name is seeded from the first value observed for that name rather
+ * than from 0.0, so inputs that are all negative yield their true maximum. Features whose value
+ * is null or cannot be parsed as a double are skipped. Output order follows the order in which
+ * feature names were first encountered.
+ */
+public class MaxContext implements CleartkExtractor.Context {
+
+  private final Context[] contexts;
+
+  private final String name;
+
+  /**
+   * Constructs a {@link Context} which reduces the features extracted by the argument contexts
+   * to one feature per distinct feature name, holding the maximum value for that name.
+   *
+   * @param contexts
+   *          The contexts whose features should be max-pooled.
+   */
+  public MaxContext(Context... contexts) {
+    this.contexts = contexts;
+    String[] names = new String[contexts.length + 1];
+    names[0] = "Max";
+    for (int i = 1; i < names.length; ++i) {
+      names[i] = contexts[i - 1].getName();
+    }
+    this.name = Feature.createName(names);
+  }
+
+  @Override
+  public String getName() {
+    return this.name;
+  }
+
+  /**
+   * Extracts features from every wrapped context and returns, per feature name, the maximum
+   * numeric value, as a feature named {@code <contextName>_<featureName>}.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    // Insertion-ordered so the emitted feature order is deterministic.
+    // Fully qualified because java.util.LinkedHashMap is not among this file's imports.
+    HashMap<String, Double> maxima = new java.util.LinkedHashMap<>();
+
+    for (Context context : this.contexts) {
+      for (Feature feature : context.extract(
+          jCas,
+          focusAnnotation,
+          bounds,
+          annotationClass,
+          extractor)) {
+        Object raw = feature.getValue();
+        if (raw == null) {
+          continue; // nothing to pool
+        }
+        double val;
+        try {
+          val = Double.parseDouble(raw.toString());
+        } catch (NumberFormatException ignored) {
+          // Non-numeric features (e.g. string-valued markers) cannot be pooled; skip them.
+          continue;
+        }
+        // The first value for a name is stored as-is; later values are folded in with max.
+        maxima.merge(feature.getName(), val, Double::max);
+      }
+    }
+
+    List<Feature> features = new ArrayList<>(maxima.size());
+    for (String key : maxima.keySet()) {
+      features.add(new Feature(this.name + "_" + key, maxima.get(key)));
+    }
+    return features;
+  }
+
+}
Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/MinContext.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,74 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Context} that pools the numeric features produced by the wrapped contexts by keeping,
+ * for every distinct feature name, the smallest value seen.
+ * <p>
+ * The running value for a name is seeded from the first value observed for that name rather
+ * than from 0.0, so inputs that are all positive yield their true minimum. Features whose value
+ * is null or cannot be parsed as a double are skipped. Output order follows the order in which
+ * feature names were first encountered.
+ */
+public class MinContext implements CleartkExtractor.Context {
+
+  private final Context[] contexts;
+
+  private final String name;
+
+  /**
+   * Constructs a {@link Context} which reduces the features extracted by the argument contexts
+   * to one feature per distinct feature name, holding the minimum value for that name.
+   *
+   * @param contexts
+   *          The contexts whose features should be min-pooled.
+   */
+  public MinContext(Context... contexts) {
+    this.contexts = contexts;
+    String[] names = new String[contexts.length + 1];
+    names[0] = "Min";
+    for (int i = 1; i < names.length; ++i) {
+      names[i] = contexts[i - 1].getName();
+    }
+    this.name = Feature.createName(names);
+  }
+
+  @Override
+  public String getName() {
+    return this.name;
+  }
+
+  /**
+   * Extracts features from every wrapped context and returns, per feature name, the minimum
+   * numeric value, as a feature named {@code <contextName>_<featureName>}.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    // Insertion-ordered so the emitted feature order is deterministic.
+    // Fully qualified because java.util.LinkedHashMap is not among this file's imports.
+    HashMap<String, Double> minima = new java.util.LinkedHashMap<>();
+
+    for (Context context : this.contexts) {
+      for (Feature feature : context.extract(
+          jCas,
+          focusAnnotation,
+          bounds,
+          annotationClass,
+          extractor)) {
+        Object raw = feature.getValue();
+        if (raw == null) {
+          continue; // nothing to pool
+        }
+        double val;
+        try {
+          val = Double.parseDouble(raw.toString());
+        } catch (NumberFormatException ignored) {
+          // Non-numeric features (e.g. string-valued markers) cannot be pooled; skip them.
+          continue;
+        }
+        // The first value for a name is stored as-is; later values are folded in with min.
+        minima.merge(feature.getName(), val, Double::min);
+      }
+    }
+
+    List<Feature> features = new ArrayList<>(minima.size());
+    for (String key : minima.keySet()) {
+      features.add(new Feature(this.name + "_" + key, minima.get(key)));
+    }
+    return features;
+  }
+
+}
Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/PrecedingWithPadding.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,44 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
+
+/**
+ * A {@link Preceding} context that replaces every feature whose value starts with "OOB" by
+ * {@link #dims} zero-valued features, so the output has a fixed width per position.
+ * <p>
+ * NOTE(review): "OOB" is presumably the placeholder the parent context emits for positions that
+ * fall outside the extraction bounds; confirm against the ClearTK version in use.
+ */
+public class PrecedingWithPadding extends Preceding {
+
+  /** Number of zero-valued features emitted for each "OOB" feature. */
+  public int dims;
+
+  /**
+   * @param end passed to the {@link Preceding} constructor as its second argument; the first
+   *          argument is fixed at 0
+   * @param dims number of padding features emitted per "OOB" feature
+   */
+  public PrecedingWithPadding(int end, int dims) {
+    super(0, end);
+    this.dims = dims;
+  }
+
+  /**
+   * Runs the parent extraction and expands each "OOB" feature into {@code dims} features named
+   * {@code <originalName>_<index>} with value 0.0; all other features pass through unchanged.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    List<Feature> padded = new ArrayList<>();
+
+    for (Feature raw : super.extract(jCas, focusAnnotation, bounds, annotationClass, extractor)) {
+      boolean outOfBounds = raw.getValue().toString().startsWith("OOB");
+      if (!outOfBounds) {
+        padded.add(raw);
+        continue;
+      }
+      // One zero feature per dimension stands in for the missing position.
+      String baseName = raw.getName();
+      for (int d = 0; d < this.dims; d++) {
+        padded.add(new Feature(baseName + "_" + d, 0.0));
+      }
+    }
+    return padded;
+  }
+}
Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java?rev=1750710&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cleartk/SumContext.java Wed Jun 29 20:05:37 2016
@@ -0,0 +1,75 @@
+package org.apache.ctakes.core.cleartk;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Context} that pools the numeric features produced by the wrapped contexts by adding
+ * up, for every distinct feature name, all values seen. Features whose value cannot be read as
+ * a double are silently skipped. Output order follows the order in which feature names were
+ * first encountered.
+ */
+public class SumContext implements CleartkExtractor.Context {
+
+  private Context[] contexts;
+
+  private String name;
+
+  /**
+   * Constructs a {@link Context} which reduces the features extracted by the argument contexts
+   * to one feature per distinct feature name, holding the sum of the values for that name.
+   *
+   * @param contexts
+   *          The contexts whose features should be summed.
+   */
+  public SumContext(Context... contexts) {
+    this.contexts = contexts;
+    String[] parts = new String[contexts.length + 1];
+    parts[0] = "Sum";
+    int slot = 1;
+    for (Context wrapped : contexts) {
+      parts[slot++] = wrapped.getName();
+    }
+    this.name = Feature.createName(parts);
+  }
+
+  @Override
+  public String getName() {
+    return this.name;
+  }
+
+  /**
+   * Extracts features from every wrapped context and returns, per feature name, the sum of the
+   * numeric values, as a feature named {@code <contextName>_<featureName>}.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas,
+      Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor)
+      throws CleartkExtractorException {
+    LinkedHashMap<String, Double> sums = new LinkedHashMap<>();
+
+    for (Context wrapped : this.contexts) {
+      List<Feature> extracted =
+          wrapped.extract(jCas, focusAnnotation, bounds, annotationClass, extractor);
+      for (Feature candidate : extracted) {
+        try {
+          double parsed = Double.parseDouble(candidate.getValue().toString());
+          Double soFar = sums.get(candidate.getName());
+          // A name seen for the first time starts from 0.0 before the value is added.
+          sums.put(candidate.getName(), (soFar == null ? 0.0 : soFar) + parsed);
+        } catch (Exception ignored) {
+          // just ignore this feature?
+        }
+      }
+    }
+
+    List<Feature> features = new ArrayList<>();
+    sums.forEach((key, total) -> features.add(new Feature(this.name + "_" + key, total)));
+    return features;
+  }
+
+}