You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/02/06 23:13:34 UTC
svn commit: r1565460 - in
/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines:
GenerateDependencyRepresentation.java GenerateTreeRepresentation.java
Author: tmill
Date: Thu Feb 6 22:13:34 2014
New Revision: 1565460
URL: http://svn.apache.org/r1565460
Log:
CTAKES-94: Refactored code for printing constituency/dependency representations for training.
Added:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java
Modified:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java?rev=1565460&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateDependencyRepresentation.java Thu Feb 6 22:13:34 2014
@@ -0,0 +1,133 @@
+package org.apache.ctakes.assertion.pipelines;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+
+import org.apache.ctakes.assertion.eval.XMIReader;
+import org.apache.ctakes.assertion.pipelines.GenerateTreeRepresentation.ATTRIBUTE;
+import org.apache.ctakes.assertion.util.AssertionDepUtils;
+import org.apache.ctakes.assertion.util.AssertionTreeUtils;
+import org.apache.ctakes.assertion.util.SemanticClasses;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.utils.tree.SimpleTree;
+import org.apache.log4j.Logger;
+import org.apache.uima.UIMAException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.util.JCasUtil;
+
+public class GenerateDependencyRepresentation {
+ public static class Options {
+
+ @Option(
+ name = "--train-dir",
+ usage = "specify the directory containing the XMI training files (for example, /NLP/Corpus/Relations/mipacq/xmi/train)",
+ required = true)
+ public File trainDirectory;
+
+ @Option(
+ name = "--output",
+ usage = "The file to which the data points be written.",
+ required = true)
+ public File outFile;
+
+ @Option(name = "--attribute", required=false)
+ public ATTRIBUTE attributeType = ATTRIBUTE.NEG;
+ }
+
+ protected static Options options = new Options();
+ private static SemanticClasses sems = null;
+ private static PrintStream out = null;
+ private static Logger log = Logger.getLogger(GenerateDependencyRepresentation.class);
+
+ /**
+ * @param args
+ * @throws CmdLineException
+ */
+ public static void main(String[] args) throws UIMAException, IOException, CmdLineException {
+ CmdLineParser optionParser = new CmdLineParser(options);
+ optionParser.parseArgument(args);
+
+ out = new PrintStream(options.outFile);
+ List<File> trainFiles = Arrays.asList(options.trainDirectory.listFiles());
+ if(sems == null){
+ sems = new SemanticClasses(FileLocator.getAsStream("org/apache/ctakes/assertion/all_cues.txt"));
+ }
+
+ String[] paths = new String[trainFiles.size()];
+ for (int i = 0; i < paths.length; ++i) {
+ paths[i] = trainFiles.get(i).getPath();
+ }
+ CollectionReader reader = CollectionReaderFactory.createCollectionReader(
+ XMIReader.class,
+ XMIReader.PARAM_FILES,
+ paths);
+
+ JCasIterable casIter = new JCasIterable(reader);
+ while(casIter.hasNext()){
+ JCas jcas = casIter.next();
+// String docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
+// out.println("## Document id: " + docId);
+ processDocument(jcas);
+ }
+ out.close();
+
+ }
+
+ public static void processDocument(JCas jcas) {
+ log.info("Processing document: " + DocumentIDAnnotationUtil.getDocumentID(jcas));
+ Collection<Sentence> sents = JCasUtil.select(jcas, Sentence.class);
+ for(Sentence sent : sents){
+ List<ConllDependencyNode> nodes = JCasUtil.selectCovered(jcas, ConllDependencyNode.class, sent);
+
+ // now that we've bult the tree, let's get the sub-trees for each concept:
+ List<IdentifiedAnnotation> mentions = new ArrayList<IdentifiedAnnotation>(JCasUtil.selectCovered(EventMention.class, sent));
+ mentions.addAll(JCasUtil.selectCovered(EntityMention.class, sent));
+
+ for(IdentifiedAnnotation mention : mentions){
+
+ SimpleTree tree = AssertionDepUtils.getTokenTreeString(jcas, nodes, mention);
+
+// String treeStr = AnnotationDepUtils.getTokenRelTreeString(jcas, nodes, new Annotation[]{mention}, new String[]{"CONCEPT"}, true);
+
+ if(tree == null) continue;
+ AssertionTreeUtils.replaceDependencyWordsWithSemanticClasses(tree, sems);
+ String label = "-1";
+
+
+ if(options.attributeType == ATTRIBUTE.NEG && mention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT ||
+ options.attributeType == ATTRIBUTE.UNC && mention.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT){
+ label = "+1";
+ }
+
+ out.print(label);
+ out.print(" |BT| ");
+ out.print(tree.toString()); //tree.toString());
+ out.println(" |ET|");
+ out.flush();
+// // restore cat name:
+// node2tree.get(headNode).cat = realCat;
+
+ }
+
+// out.println(node2tree.get(rootNode).toString());
+ }
+ }
+}
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java?rev=1565460&r1=1565459&r2=1565460&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GenerateTreeRepresentation.java Thu Feb 6 22:13:34 2014
@@ -12,19 +12,19 @@ import java.util.List;
import org.apache.ctakes.assertion.eval.XMIReader;
import org.apache.ctakes.assertion.util.SemanticClasses;
-import org.apache.ctakes.constituency.parser.util.AnnotationTreeUtils;
import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
import org.apache.ctakes.typesystem.type.constants.CONST;
-import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.utils.tree.SimpleTree;
import org.apache.log4j.Logger;
import org.apache.uima.UIMAException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.cleartk.util.Options_ImplBase;
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.uimafit.factory.CollectionReaderFactory;
import org.uimafit.pipeline.JCasIterable;
@@ -32,7 +32,9 @@ import org.uimafit.util.JCasUtil;
public class GenerateTreeRepresentation{
- public static class Options extends Options_ImplBase {
+ enum ATTRIBUTE {NEG, UNC}
+
+ public static class Options {
@Option(
name = "--train-dir",
@@ -45,6 +47,9 @@ public class GenerateTreeRepresentation{
usage = "The file to which the data points be written.",
required = true)
public File outFile;
+
+ @Option(name = "--attribute", required=false)
+ public ATTRIBUTE attributeType = ATTRIBUTE.NEG;
}
protected static Options options = new Options();
@@ -58,8 +63,17 @@ public class GenerateTreeRepresentation{
* @throws UIMAException
*/
public static void main(String[] args) throws UIMAException, IOException {
- options.parseOptions(args);
+ CmdLineParser parser = new CmdLineParser(options);
+ try {
+ parser.parseArgument(args);
+ } catch (CmdLineException e) {
+ e.printStackTrace();
+ System.exit(-1);
+ }
+ if(sems == null){
+ sems = new SemanticClasses(FileLocator.getAsStream("org/apache/ctakes/assertion/all_cues.txt"));
+ }
out = new PrintStream(options.outFile);
List<File> trainFiles = Arrays.asList(options.trainDirectory.listFiles());
@@ -80,31 +94,52 @@ public class GenerateTreeRepresentation{
out.close();
}
- public static void processDocument(JCas jcas) throws ResourceInitializationException, FileNotFoundException {
+ public static void processDocument(JCas jcas) { // Prints one "label |BT| tree |ET|" line to out for each event/entity mention in the CAS.
 log.info("Processing document: " + DocumentIDAnnotationUtil.getDocumentID(jcas));
- if(sems == null){
- sems = new SemanticClasses(FileLocator.locateFile("org/apache/ctakes/assertion/models/semantic_classes").getAbsolutePath());
- }
 Collection<IdentifiedAnnotation> mentions = JCasUtil.select(jcas, IdentifiedAnnotation.class); // every IdentifiedAnnotation in the CAS; narrowed to event/entity mentions below
 for(IdentifiedAnnotation mention : mentions){
- TopTreebankNode orig = AnnotationTreeUtils.getAnnotationTree(jcas, mention);
- if(orig == null){
- log.warn("Tree for entity mention: " + mention.getCoveredText() + " (" + mention.getBegin() + "-" + mention.getEnd() + ") is null.");
- continue;
- }
- SimpleTree tree = extractAboveLeftConceptTree(jcas, mention, sems);
-// if(mention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT){
- if(mention.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT){
- out.print("+1 ");
- }else{
- out.print("-1 ");
- }
-
- out.print("|BT| ");
- out.print(tree.toString());
- out.println(" |ET|");
- out.flush();
+ if(mention instanceof EventMention || mention instanceof EntityMention){ // other IdentifiedAnnotation subtypes produce no output
+// TopTreebankNode orig = AnnotationTreeUtils.getAnnotationTree(jcas, mention);
+// if(orig == null){
+// log.warn("Tree for entity mention: " + mention.getCoveredText() + " (" + mention.getBegin() + "-" + mention.getEnd() + ") is null.");
+// continue;
+// }
+ SimpleTree tree = null; // extractFeatureTree(jcas, mention, sems);
+// SimpleTree tree = extractAboveLeftConceptTree(jcas, mention, null);
+// SimpleTree tree = AssertionTreeUtils.extractAboveRightConceptTree(jcas, mention, sems);
+ String label = null;
+
+ if(options.attributeType == ATTRIBUTE.NEG){ // label is "+1" when the selected attribute is present on the mention, "-1" otherwise
+ if(mention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT) label = "+1";
+ else label = "-1";
+ tree = getNegationTree(jcas, mention);
+ }else if(options.attributeType == ATTRIBUTE.UNC){
+ if(mention.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT) label = "+1";
+ else label = "-1";
+ tree = getUncertaintyTree(jcas, mention);
+ }else{ // reached for a null attributeType or any ATTRIBUTE value other than NEG and UNC
+ throw new IllegalArgumentException("Do not have this attribute type!");
+ }
+
+ out.print(label);
+ out.print(" |BT| ");
+ out.print(tree.toString()); // NOTE(review): NullPointerException if the tree helper returns null; the earlier null check is commented out above - confirm
+ out.println(" |ET|");
+ out.flush();
+ }
 }
 }
+  /**
+   * Returns the tree printed for a mention when the uncertainty attribute is
+   * being generated; delegates to extractAboveLeftConceptTree with the
+   * class-level semantic classes.
+   */
+  private static SimpleTree getUncertaintyTree(JCas jcas, IdentifiedAnnotation mention) {
+    return extractAboveLeftConceptTree(jcas, mention, sems);
+  }
+
+  /**
+   * Returns the tree printed for a mention when the negation attribute is
+   * being generated; delegates to extractAboveLeftConceptTree with the
+   * class-level semantic classes.
+   */
+  private static SimpleTree getNegationTree(JCas jcas, IdentifiedAnnotation mention) {
+    return extractAboveLeftConceptTree(jcas, mention, sems);
+  }
+
}