You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/02/06 23:13:34 UTC

svn commit: r1565460 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines: GenerateDependencyRepresentation.java GenerateTreeRepresentation.java

Author: tmill
Date: Thu Feb  6 22:13:34 2014
New Revision: 1565460

URL: http://svn.apache.org/r1565460
Log:
CTAKES-94: Refactored code for printing constituency/dependency representations for training.

Added:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java
Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java?rev=1565460&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java Thu Feb  6 22:13:34 2014
@@ -0,0 +1,133 @@
+package org.apache.ctakes.assertion.pipelines;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.ctakes.assertion.eval.XMIReader;
+import org.apache.ctakes.assertion.pipelines.GenerateTreeRepresentation.ATTRIBUTE;
+import org.apache.ctakes.assertion.util.AssertionDepUtils;
+import org.apache.ctakes.assertion.util.AssertionTreeUtils;
+import org.apache.ctakes.assertion.util.SemanticClasses;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.utils.tree.SimpleTree;
+import org.apache.log4j.Logger;
+import org.apache.uima.UIMAException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Command-line tool that reads XMI training files and writes one labeled
+ * dependency-tree instance per event/entity mention to an output file.
+ * <p>
+ * Each output line has the form {@code <label> |BT| <tree> |ET|}, where the label is
+ * {@code +1} or {@code -1} depending on the attribute selected with {@code --attribute}.
+ */
+public class GenerateDependencyRepresentation {
+  /** Command-line options, populated by args4j from the arguments given to {@link #main(String[])}. */
+  public static class Options {
+
+    /** Directory whose files are handed to the XMI collection reader. */
+    @Option(
+        name = "--train-dir",
+        usage = "specify the directory containing the XMI training files (for example, /NLP/Corpus/Relations/mipacq/xmi/train)",
+        required = true)
+    public File trainDirectory;
+
+    /** File that receives the generated training instances. */
+    @Option(
+        name = "--output",
+        usage = "The file to which the data points will be written.",
+        required = true)
+    public File outFile;
+
+    /** Attribute whose value decides the +1/-1 label; defaults to NEG. */
+    @Option(
+        name = "--attribute",
+        usage = "The attribute to generate labels for: NEG (polarity, the default) or UNC (uncertainty).",
+        required = false)
+    public ATTRIBUTE attributeType = ATTRIBUTE.NEG;
+  }
+
+  protected static Options options = new Options();
+  // cue-word classes, loaded once in main() and substituted into every tree
+  private static SemanticClasses sems = null;
+  // destination for the generated instances; opened and closed by main()
+  private static PrintStream out = null;
+  private static final Logger log = Logger.getLogger(GenerateDependencyRepresentation.class);
+
+  /**
+   * Parses the command line, reads every file in the training directory as XMI and
+   * writes the training instances of each document to the output file.
+   *
+   * @param args command-line arguments; see {@link Options}
+   * @throws UIMAException if the collection reader cannot be created
+   * @throws IOException if the training directory cannot be listed or the output file cannot be opened
+   * @throws CmdLineException if the arguments do not match {@link Options}
+   */
+  public static void main(String[] args) throws UIMAException, IOException, CmdLineException {
+    CmdLineParser optionParser = new CmdLineParser(options);
+    optionParser.parseArgument(args);
+
+    // File.listFiles() returns null when the path is not a directory or cannot be read;
+    // report that clearly, and do so before the output file is created/truncated.
+    File[] trainFiles = options.trainDirectory.listFiles();
+    if(trainFiles == null){
+      throw new IOException("Cannot list training directory: " + options.trainDirectory);
+    }
+    String[] paths = new String[trainFiles.length];
+    for (int i = 0; i < paths.length; ++i) {
+      paths[i] = trainFiles[i].getPath();
+    }
+
+    if(sems == null){
+      sems = new SemanticClasses(FileLocator.getAsStream("org/apache/ctakes/assertion/all_cues.txt"));
+    }
+
+    CollectionReader reader = CollectionReaderFactory.createCollectionReader(
+            XMIReader.class,
+            XMIReader.PARAM_FILES,
+            paths);
+
+    out = new PrintStream(options.outFile);
+    try{
+      JCasIterable casIter = new JCasIterable(reader);
+      while(casIter.hasNext()){
+        processDocument(casIter.next());
+      }
+    }finally{
+      // release the output file even when a document fails to process
+      out.close();
+    }
+  }
+
+  /**
+   * Writes one training instance for every event and entity mention in the document.
+   * <p>
+   * For each sentence, the dependency nodes it covers are collected, and for each
+   * EventMention/EntityMention covered by that sentence a tree is requested from
+   * {@code AssertionDepUtils.getTokenTreeString}. Mentions for which no tree is returned
+   * are skipped. The instance is labeled {@code +1} when the selected attribute is NEG and
+   * the mention's polarity is {@code CONST.NE_POLARITY_NEGATION_PRESENT}, or when it is UNC
+   * and the mention's uncertainty is {@code CONST.NE_UNCERTAINTY_PRESENT}; otherwise {@code -1}.
+   *
+   * @param jcas the document to process
+   */
+  public static void processDocument(JCas jcas) {
+    log.info("Processing document: " + DocumentIDAnnotationUtil.getDocumentID(jcas));
+    for(Sentence sent : JCasUtil.select(jcas, Sentence.class)){
+      List<ConllDependencyNode> nodes = JCasUtil.selectCovered(jcas, ConllDependencyNode.class, sent);
+
+      // gather the event mentions, then the entity mentions, covered by this sentence
+      List<IdentifiedAnnotation> mentions = new ArrayList<IdentifiedAnnotation>(JCasUtil.selectCovered(EventMention.class, sent));
+      mentions.addAll(JCasUtil.selectCovered(EntityMention.class, sent));
+
+      for(IdentifiedAnnotation mention : mentions){
+        SimpleTree tree = AssertionDepUtils.getTokenTreeString(jcas, nodes, mention);
+        if(tree == null) continue;
+        AssertionTreeUtils.replaceDependencyWordsWithSemanticClasses(tree, sems);
+
+        // explicit parentheses: each attribute type is paired with its own "present" constant
+        boolean positive =
+            (options.attributeType == ATTRIBUTE.NEG && mention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT) ||
+            (options.attributeType == ATTRIBUTE.UNC && mention.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT);
+        String label = positive ? "+1" : "-1";
+
+        out.print(label);
+        out.print(" |BT| ");
+        out.print(tree.toString());
+        out.println(" |ET|");
+        out.flush();
+      }
+    }
+  }
+}

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java?rev=1565460&r1=1565459&r2=1565460&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java Thu Feb  6 22:13:34 2014
@@ -12,19 +12,19 @@ import java.util.List;
 
 import org.apache.ctakes.assertion.eval.XMIReader;
 import org.apache.ctakes.assertion.util.SemanticClasses;
-import org.apache.ctakes.constituency.parser.util.AnnotationTreeUtils;
 import org.apache.ctakes.core.resource.FileLocator;
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.ctakes.typesystem.type.constants.CONST;
-import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.utils.tree.SimpleTree;
 import org.apache.log4j.Logger;
 import org.apache.uima.UIMAException;
 import org.apache.uima.collection.CollectionReader;
 import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.cleartk.util.Options_ImplBase;
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
 import org.kohsuke.args4j.Option;
 import org.uimafit.factory.CollectionReaderFactory;
 import org.uimafit.pipeline.JCasIterable;
@@ -32,7 +32,9 @@ import org.uimafit.util.JCasUtil;
 
 public class GenerateTreeRepresentation{
 
-	public static class Options extends Options_ImplBase {
+  enum ATTRIBUTE {NEG, UNC}
+  
+	public static class Options {
 
 		@Option(
 				name = "--train-dir",
@@ -45,6 +47,9 @@ public class GenerateTreeRepresentation{
 				usage = "The file to which the data points be written.",
 				required = true)
 		public File outFile;
+		
+		@Option(name = "--attribute", required=false)
+		public ATTRIBUTE attributeType = ATTRIBUTE.NEG;
 	}
 	
 	protected static Options options = new Options();
@@ -58,8 +63,17 @@ public class GenerateTreeRepresentation{
 	 * @throws UIMAException 
 	 */
 	public static void main(String[] args) throws UIMAException, IOException {
-	    options.parseOptions(args);
+	    CmdLineParser parser = new CmdLineParser(options);
+	    try {
+        parser.parseArgument(args);
+      } catch (CmdLineException e) {
+        e.printStackTrace();
+        System.exit(-1);
+      }
 	    
+	    if(sems == null){
+	      sems = new SemanticClasses(FileLocator.getAsStream("org/apache/ctakes/assertion/all_cues.txt"));
+	    }
 	    out = new PrintStream(options.outFile);
 	    List<File> trainFiles = Arrays.asList(options.trainDirectory.listFiles());
 
@@ -80,31 +94,52 @@ public class GenerateTreeRepresentation{
 	    out.close();
 	}
 
-	public static void processDocument(JCas jcas) throws ResourceInitializationException, FileNotFoundException {
+	public static void processDocument(JCas jcas) {
 		log.info("Processing document: " + DocumentIDAnnotationUtil.getDocumentID(jcas));
-		if(sems == null){
-			sems = new SemanticClasses(FileLocator.locateFile("org/apache/ctakes/assertion/models/semantic_classes").getAbsolutePath());
-		}
 		Collection<IdentifiedAnnotation> mentions = JCasUtil.select(jcas, IdentifiedAnnotation.class);
 		for(IdentifiedAnnotation mention : mentions){
-			TopTreebankNode orig = AnnotationTreeUtils.getAnnotationTree(jcas, mention);
-			if(orig == null){
-				log.warn("Tree for entity mention: " + mention.getCoveredText() + " (" + mention.getBegin() + "-" + mention.getEnd() + ") is null.");
-				continue;
-			}
-			SimpleTree tree = extractAboveLeftConceptTree(jcas, mention, sems);
-//			if(mention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT){
-			if(mention.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT){
-				out.print("+1 ");
-			}else{
-				out.print("-1 ");
-			}
-			
-			out.print("|BT| ");
-			out.print(tree.toString());
-			out.println(" |ET|");
-			out.flush();
+		  if(mention instanceof EventMention || mention instanceof EntityMention){
+//		    TopTreebankNode orig = AnnotationTreeUtils.getAnnotationTree(jcas, mention);
+//		    if(orig == null){
+//		      log.warn("Tree for entity mention: " + mention.getCoveredText() + " (" + mention.getBegin() + "-" + mention.getEnd() + ") is null.");
+//		      continue;
+//		    }
+		    SimpleTree tree = null; // extractFeatureTree(jcas, mention, sems);
+//		    SimpleTree tree = extractAboveLeftConceptTree(jcas, mention, null);
+//		    			SimpleTree tree = AssertionTreeUtils.extractAboveRightConceptTree(jcas, mention, sems);
+		    String label = null;
+		    
+		    if(options.attributeType == ATTRIBUTE.NEG){ 
+		      if(mention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT) label = "+1";
+		      else label = "-1";
+          tree = getNegationTree(jcas, mention);
+		    }else if(options.attributeType == ATTRIBUTE.UNC){
+		      if(mention.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT) label = "+1";
+		      else label = "-1";
+		      tree = getUncertaintyTree(jcas, mention);
+		    }else{
+		      throw new IllegalArgumentException("Do not have this attribute type!");
+		    }
+
+		    out.print(label);
+		    out.print(" |BT| ");
+		    out.print(tree.toString());
+		    out.println(" |ET|");
+		    out.flush();
+		  }
 		}
 	}
 
+  /**
+   * Returns the tree used for uncertainty instances: the result of
+   * extractAboveLeftConceptTree for the mention, using the shared semantic classes.
+   */
+  private static SimpleTree getUncertaintyTree(JCas jcas, IdentifiedAnnotation mention) {
+    return extractAboveLeftConceptTree(jcas, mention, sems);
+  }
+
+  /**
+   * Returns the tree used for negation instances: the result of
+   * extractAboveLeftConceptTree for the mention, using the shared semantic classes.
+   */
+  private static SimpleTree getNegationTree(JCas jcas, IdentifiedAnnotation mention) {
+    return extractAboveLeftConceptTree(jcas, mention, sems);
+  }
+
 }