You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/05/22 16:21:07 UTC
svn commit: r1485219 - in
/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util:
AssertionTreeUtils.java SemanticClasses.java
Author: tmill
Date: Wed May 22 14:21:07 2013
New Revision: 1485219
URL: http://svn.apache.org/r1485219
Log:
addresses ctakes-154: Support classes for extracting tree fragment features.
Added:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java (with props)
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java (with props)
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java?rev=1485219&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java Wed May 22 14:21:07 2013
@@ -0,0 +1,118 @@
+package org.apache.ctakes.assertion.util;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.ctakes.constituency.parser.treekernel.TreeExtractor;
+import org.apache.ctakes.constituency.parser.util.AnnotationTreeUtils;
+import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.utils.tree.SimpleTree;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+
+/**
+ * Helpers that turn the constituency parse around a mention into a
+ * {@link SimpleTree} fragment for use as a tree-kernel feature.
+ */
+public class AssertionTreeUtils {
+
+    /**
+     * Extracts the tree fragment above and to the left of a mention.
+     * The mention is wrapped in a "CONCEPT" node, the search climbs from that node
+     * to the first node whose type starts with "S", and the nodes to the right of
+     * the CONCEPT node are removed from that subtree.
+     *
+     * @param jcas the CAS handed through to {@code AnnotationTreeUtils}
+     * @param mention the annotation to mark as CONCEPT
+     * @param sems semantic classes to substitute for matching leaves, or null to skip substitution
+     * @return the fragment after {@code TreeExtractor.lowercaseWords}; the tree parsed from
+     *         "(S noparse)" when no annotation tree is found for the mention
+     */
+    public static SimpleTree extractAboveLeftConceptTree(JCas jcas, Annotation mention, SemanticClasses sems){
+        return extractAboveConceptTree(jcas, mention, sems, true);
+    }
+
+    /**
+     * Extracts the tree fragment above and to the right of a mention.
+     * Works like {@link #extractAboveLeftConceptTree} except that the nodes to the
+     * <em>left</em> of the CONCEPT node are removed. Per the original author's note this
+     * is meant to permit post-mention negation like "problem resolved" or "problem ruled out".
+     *
+     * @param jcas the CAS handed through to {@code AnnotationTreeUtils}
+     * @param mention the annotation to mark as CONCEPT
+     * @param sems semantic classes to substitute for matching leaves, or null to skip substitution
+     * @return the fragment after {@code TreeExtractor.lowercaseWords}; the tree parsed from
+     *         "(S noparse)" when no annotation tree is found for the mention
+     */
+    public static SimpleTree extractAboveRightConceptTree(JCas jcas, Annotation mention, SemanticClasses sems){
+        return extractAboveConceptTree(jcas, mention, sems, false);
+    }
+
+    /**
+     * Shared implementation of the two public extraction methods.
+     *
+     * @param keepLeft true to remove what lies right of the CONCEPT node,
+     *        false to remove what lies left of it
+     */
+    private static SimpleTree extractAboveConceptTree(JCas jcas, Annotation mention, SemanticClasses sems, boolean keepLeft){
+        SimpleTree tree;
+        TopTreebankNode annotationTree = AnnotationTreeUtils.getAnnotationTree(jcas, mention);
+        if(annotationTree != null){
+            // work on the tree returned by getTreeCopy and mark the mention inside it
+            TopTreebankNode root = AnnotationTreeUtils.getTreeCopy(jcas, annotationTree);
+            TreebankNode conceptNode = AnnotationTreeUtils.insertAnnotationNode(jcas, root, mention, "CONCEPT");
+
+            // navigate up the tree to the first node whose type starts with "S";
+            // if there is none the loop stops at the node that has no parent
+            TreebankNode node = conceptNode;
+            while(node.getParent() != null && !node.getNodeType().startsWith("S")){
+                node = node.getParent();
+            }
+
+            // prune one side of the CONCEPT node inside the selected subtree
+            if(keepLeft){
+                AnnotationTreeUtils.removeRightOfAnnotation(jcas, node, conceptNode);
+            }else{
+                AnnotationTreeUtils.removeLeftOfAnnotation(jcas, node, conceptNode);
+            }
+
+            tree = TreeExtractor.getSimpleClone(node);
+        }else{
+            // no annotation tree for this mention: fall back to a placeholder tree
+            tree = SimpleTree.fromString("(S noparse)");
+        }
+
+        TreeExtractor.lowercaseWords(tree);
+        if(sems != null){
+            replaceWordsWithSemanticClasses(tree, sems);
+        }
+        return tree;
+    }
+
+    /**
+     * Recursively rewrites every leaf whose label is a member of a semantic class
+     * to "semclass_" followed by the name of that class. Labels of non-leaf
+     * nodes are not touched.
+     *
+     * @param tree the tree to modify in place
+     * @param sems map from class name to the set of words belonging to that class
+     */
+    public static void replaceWordsWithSemanticClasses(SimpleTree tree, SemanticClasses sems){
+        if(tree.isLeaf()){
+            // recursion base case: actually apply the semantic classes
+            for(Map.Entry<String,HashSet<String>> semClass : sems.entrySet()){
+                if(semClass.getValue().contains(tree.cat)){
+                    tree.cat = "semclass_" + semClass.getKey();
+                }
+            }
+        }else{
+            for(SimpleTree child : tree.children){
+                replaceWordsWithSemanticClasses(child, sems);
+            }
+        }
+    }
+
+    // Static, so the original-word -> random-word mapping is shared by every call
+    // and every instance; access is not synchronized.
+    static HashMap<String,String> wordMap = new HashMap<String,String>();
+    static Random random = new Random();
+
+    /**
+     * Replaces node labels with random lowercase words of 3 to 10 letters.
+     * A label that has been replaced before gets the replacement stored in
+     * {@code wordMap}, so equal labels map to equal random words. Nodes labelled
+     * "CONCEPT" or "TOP" are never changed. The whole tree is visited recursively.
+     *
+     * @param tree the tree to modify in place
+     * @param dep if true every node is eligible for replacement; if false only
+     *        nodes without children are (presumably "dep" means dependency tree,
+     *        where words also sit on inner nodes -- TODO confirm)
+     */
+    public void randomizeWords(SimpleTree tree, boolean dep) {
+        if(!tree.cat.equals("CONCEPT") && !tree.cat.equals("TOP") && (dep || tree.children.size() == 0)){
+            // values stored in wordMap are never null, so a null result means "not seen yet"
+            String known = wordMap.get(tree.cat);
+            if(known != null){
+                tree.cat = known;
+            }else{
+                // generate new random word... (from http://stackoverflow.com/a/4952066)
+                String oldWord = tree.cat;
+                char[] word = new char[random.nextInt(8)+3]; // words of length 3 through 10. (1 and 2 letter words are boring.)
+                for(int j = 0; j < word.length; j++){
+                    word[j] = (char)('a' + random.nextInt(26));
+                }
+                tree.cat = new String(word);
+                wordMap.put(oldWord, tree.cat);
+            }
+        }
+        for(SimpleTree child : tree.children){
+            randomizeWords(child, dep);
+        }
+    }
+
+}
Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionTreeUtils.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java?rev=1485219&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java Wed May 22 14:21:07 2013
@@ -0,0 +1,45 @@
+package org.apache.ctakes.assertion.util;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Scanner;
+
+import org.apache.uima.resource.ResourceInitializationException;
+
+public class SemanticClasses extends HashMap<String,HashSet<String>>{
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+
+ // loads files in the input directory into a hashmap that maps the filename minus the extension ("allergy.txt" becomes "allergy")
+ // to the set of words in that file ("allergy" => ("allergic", "allergies", "allergy", ...)
+ public SemanticClasses(String semClassDir) throws ResourceInitializationException{
+ File classDir = new File(semClassDir);
+ if(classDir.exists() && classDir.isDirectory()){
+ File[] classFiles = classDir.listFiles();
+ for(File semClass : classFiles){
+ if(semClass.isDirectory() || semClass.isHidden()) continue;
+ HashSet<String> classWords = new HashSet<String>();
+ Scanner scanner = null;
+ try {
+ scanner = new Scanner(semClass);
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ throw new ResourceInitializationException("Error: Could not open file:", new Object[]{ semClass}, e);
+ }
+ while(scanner.hasNextLine()){
+ String term = scanner.nextLine().trim();
+ // if the term on this line is a multi-word expression, ignore, because we can't
+ // place these in the tree anyways
+ if(!term.contains(" ")){
+ classWords.add(term);
+ }
+ }
+ put(semClass.getName().replace(".txt", ""), classWords);
+ }
+ }
+ }
+}
Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/SemanticClasses.java
------------------------------------------------------------------------------
svn:mime-type = text/plain