You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/04/29 15:15:19 UTC
svn commit: r1477043 - in
/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser:
MaxentParserWrapper.java util/TreeUtils.java
Author: tmill
Date: Mon Apr 29 13:15:19 2013
New Revision: 1477043
URL: http://svn.apache.org/r1477043
Log:
cTAKES-193: Fix bug with text that OpenNLP tokenizes differently than cTAKES - use the OpenNLP API directly rather than the static parseLine method.
Modified:
ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java
ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java
Modified: ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java?rev=1477043&r1=1477042&r2=1477043&view=diff
==============================================================================
--- ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java (original)
+++ ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/MaxentParserWrapper.java Mon Apr 29 13:15:19 2013
@@ -88,11 +88,12 @@ public class MaxentParserWrapper impleme
continue;
}
FSArray termArray = TreeUtils.getTerminals(jcas, sentAnnot);
+ Parse inputTokens = TreeUtils.ctakesTokensToOpennlpTokens(sentAnnot, termArray);
String sentStr = TreeUtils.getSentence(termArray);
if(sentStr.length() == 0){
parse = null;
}else{
- parse = ParserTool.parseLine(sentStr, parser, 1)[0];
+ parse = parser.parse(inputTokens);
}
TopTreebankNode top = TreeUtils.buildAlignedTree(jcas, parse, sentAnnot);
top.addToIndexes();
Modified: ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java?rev=1477043&r1=1477042&r2=1477043&view=diff
==============================================================================
--- ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java (original)
+++ ctakes/trunk/ctakes-constituency-parser/src/main/java/org/apache/ctakes/constituency/parser/util/TreeUtils.java Mon Apr 29 13:15:19 2013
@@ -18,8 +18,12 @@
*/
package org.apache.ctakes.constituency.parser.util;
-import opennlp.tools.parser.Parse;
+import java.util.ArrayList;
+import java.util.List;
+import opennlp.tools.parser.AbstractBottomUpParser;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.Span;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
@@ -36,11 +40,6 @@ import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.jcas.tcas.Annotation;
-import java.util.HashMap;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Map;
-
public class TreeUtils {
public static List<TreebankNode> getNodeList(TopTreebankNode tree){
@@ -151,12 +150,11 @@ public class TreeUtils {
return same;
}
- private static int getHighestIndexTerm(TreebankNode inTree) {
+ public static int getHighestIndexTerm(TreebankNode inTree) {
if(inTree instanceof TerminalTreebankNode){
return ((TerminalTreebankNode) inTree).getIndex();
- }else{
- return getHighestIndexTerm(inTree.getChildren(inTree.getChildren().size()-1));
}
+ return getHighestIndexTerm(inTree.getChildren(inTree.getChildren().size()-1));
}
public static TopTreebankNode getTopNode(TreebankNode inTree) {
@@ -174,7 +172,7 @@ public class TreeUtils {
FSArray termArray = TreeUtils.getTerminals(jcas, sent);
StringBuffer parseBuff = new StringBuffer();
- parse.show(parseBuff);
+ if(parse != null) parse.show(parseBuff);
TopTreebankNode top = new TopTreebankNode(jcas, sent.getBegin(), sent.getEnd());
top.setTreebankParse(parseBuff.toString());
@@ -228,15 +226,17 @@ public class TreeUtils {
public static String getSentence(FSArray termArray){
StringBuffer sent = new StringBuffer();
- int offset = 0;
+// int offset = 0;
for(int i = 0; i < termArray.size(); i++){
TerminalTreebankNode ttn = (TerminalTreebankNode) termArray.get(i);
String word = ttn.getNodeValue();
word = word.replaceAll("\\s", "");
- if(i == 0) offset = ttn.getBegin();
- else if(word.length() == 0) continue;
- else sent.append(" ");
+// if(i == 0) offset = ttn.getBegin();
+ /*else*/
+ if(word.length() == 0) continue;
+ //else
+ sent.append(" ");
sent.append(word);
}
@@ -304,5 +304,18 @@ public class TreeUtils {
}
}
}
+
+ public static Parse ctakesTokensToOpennlpTokens(Sentence sent, FSArray termArray) {
+ // based on the first part of parseLine in the opennlp libraries
+ String text = sent.getCoveredText();
+ Parse p = new Parse(sent.getCoveredText(), new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0);
+
+ for(int i = 0; i < termArray.size(); i++){
+ TerminalTreebankNode token = (TerminalTreebankNode) termArray.get(i);
+ p.insert(new Parse(text, new Span(token.getBegin()-sent.getBegin(), token.getEnd()-sent.getBegin()), AbstractBottomUpParser.TOK_NODE, 0, i));
+ }
+
+ return p;
+ }
}