You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/05/22 16:21:07 UTC

svn commit: r1485219 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util: AssertionTreeUtils.java SemanticClasses.java

Author: tmill
Date: Wed May 22 14:21:07 2013
New Revision: 1485219

URL: http://svn.apache.org/r1485219
Log:
addresses ctakes-154: Support classes for extracting tree fragment features.

Added:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java   (with props)
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java   (with props)

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java?rev=1485219&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java Wed May 22 14:21:07 2013
@@ -0,0 +1,118 @@
+package org.apache.ctakes.assertion.util;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.ctakes.constituency.parser.treekernel.TreeExtractor;
+import org.apache.ctakes.constituency.parser.util.AnnotationTreeUtils;
+import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.utils.tree.SimpleTree;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public class AssertionTreeUtils {
+
+	public static SimpleTree extractAboveLeftConceptTree(JCas jcas, Annotation mention, SemanticClasses sems){
+		SimpleTree tree = null;
+		TopTreebankNode annotationTree = AnnotationTreeUtils.getAnnotationTree(jcas, mention);
+		if(annotationTree != null){
+			TopTreebankNode root = AnnotationTreeUtils.getTreeCopy(jcas, annotationTree);
+			TreebankNode conceptNode = AnnotationTreeUtils.insertAnnotationNode(jcas, root, mention, "CONCEPT");
+			// navigate up the tree to retrieve the first "S" above this node.
+			TreebankNode node = conceptNode;
+			while(node.getParent() != null && !node.getNodeType().startsWith("S")){
+				node =  node.getParent();
+			}
+
+			// remove nodes to the right of the CONCEPT node
+			AnnotationTreeUtils.removeRightOfAnnotation(jcas, node, conceptNode);
+			
+			tree = TreeExtractor.getSimpleClone(node);
+		}else{
+			tree = SimpleTree.fromString("(S noparse)");
+		}
+
+		TreeExtractor.lowercaseWords(tree);
+		if(sems != null){
+			replaceWordsWithSemanticClasses(tree, sems);
+		}
+		return tree;
+	}
+	
+	public static SimpleTree extractAboveRightConceptTree(JCas jcas, Annotation mention, SemanticClasses sems){
+		SimpleTree tree = null;
+		TopTreebankNode annotationTree = AnnotationTreeUtils.getAnnotationTree(jcas, mention);
+		if(annotationTree != null){
+			TopTreebankNode root = AnnotationTreeUtils.getTreeCopy(jcas, annotationTree);
+			TreebankNode conceptNode = AnnotationTreeUtils.insertAnnotationNode(jcas, root, mention, "CONCEPT");
+			//						SimpleTree tree = null;
+			//						tree = TreeExtractor.getSurroundingTreeWithAnnotation(node, "CONCEPT");
+			// navigate up the tree to retrieve the first "S" above this node.
+			TreebankNode node = conceptNode;
+			while(node.getParent() != null && !node.getNodeType().startsWith("S")){
+				node =  node.getParent();
+			}
+
+			// get the VP node (clause) or S that most closely dominates the concept, and remove everything after that
+			// should smallen the tree while also permitting post-mention negation like "problem resolved" or "problem ruled out"
+			
+			// remove nodes to the right of the CONCEPT node
+			AnnotationTreeUtils.removeLeftOfAnnotation(jcas, node, conceptNode);
+		
+			tree = TreeExtractor.getSimpleClone(node);
+		}else{
+			tree = SimpleTree.fromString("(S noparse)");
+		}
+
+		TreeExtractor.lowercaseWords(tree);
+		if(sems != null){
+			replaceWordsWithSemanticClasses(tree, sems);
+		}
+		return tree;
+	}
+	
+	public static void replaceWordsWithSemanticClasses(SimpleTree tree, SemanticClasses sems){
+		// recursion base case... actually apply semantic classes...
+		if(tree.isLeaf()){
+			for(Map.Entry<String,HashSet<String>> semClass : sems.entrySet()){
+				if(semClass.getValue().contains(tree.cat)){
+					tree.cat = "semclass_" + semClass.getKey();
+				}
+			}
+		}else{
+			// iterate over children
+			for(SimpleTree child : tree.children){
+				replaceWordsWithSemanticClasses(child, sems);
+			}
+		}
+	}
+	
+	static HashMap<String,String> wordMap = new HashMap<String,String>();
+    static Random random = new Random();
+	public void randomizeWords(SimpleTree tree, boolean dep) {
+		if(!tree.cat.equals("CONCEPT") && !tree.cat.equals("TOP") && (dep || tree.children.size() == 0)){
+			if(wordMap.containsKey(tree.cat)){
+				tree.cat = wordMap.get(tree.cat);
+			}else{
+				// generate new random word... (from http://stackoverflow.com/a/4952066)
+				String oldWord = tree.cat;
+				char[] word = new char[random.nextInt(8)+3]; // words of length 3 through 10. (1 and 2 letter words are boring.)
+				for(int j = 0; j < word.length; j++)
+				{
+					word[j] = (char)('a' + random.nextInt(26));
+				}
+				tree.cat = new String(word);
+				wordMap.put(oldWord, tree.cat);
+			}
+		}
+		if(tree.children.size() > 0){
+			for(SimpleTree child : tree.children){
+				randomizeWords(child, dep);
+			}
+		}
+	}
+
+}

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java?rev=1485219&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java Wed May 22 14:21:07 2013
@@ -0,0 +1,45 @@
+package org.apache.ctakes.assertion.util;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Scanner;
+
+import org.apache.uima.resource.ResourceInitializationException;
+
+public class SemanticClasses extends HashMap<String,HashSet<String>>{
+	/**
+	 * 
+	 */
+	private static final long serialVersionUID = 1L;
+
+	// loads files in the input directory into a hashmap that maps the filename minus the extension ("allergy.txt" becomes "allergy")
+	// to the set of words in that file ("allergy" => ("allergic", "allergies", "allergy", ...)
+	public SemanticClasses(String semClassDir) throws ResourceInitializationException{
+		File classDir = new File(semClassDir);
+		if(classDir.exists() && classDir.isDirectory()){
+			File[] classFiles = classDir.listFiles();
+			for(File semClass : classFiles){
+				if(semClass.isDirectory() || semClass.isHidden()) continue;
+				HashSet<String> classWords = new HashSet<String>();
+				Scanner scanner = null;
+				try {
+					scanner = new Scanner(semClass);
+				} catch (FileNotFoundException e) {
+					e.printStackTrace();
+					throw new ResourceInitializationException("Error: Could not open file:", new Object[]{ semClass}, e);
+				}
+				while(scanner.hasNextLine()){
+					String term = scanner.nextLine().trim();
+					// if the term on this line is a multi-word expression, ignore, because we can't
+					// place these in the tree anyways
+					if(!term.contains(" ")){
+						classWords.add(term);
+					}
+				}
+				put(semClass.getName().replace(".txt", ""), classWords);
+			}
+		}
+	}
+}

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain