You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/01/07 19:14:20 UTC
svn commit: r1056435 -
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ADParagraphStream.java
Author: colen
Date: Fri Jan 7 18:14:20 2011
New Revision: 1056435
URL: http://svn.apache.org/viewvc?rev=1056435&view=rev
Log:
OPENNLP-60 Improvements to ADParagraphStream
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ADParagraphStream.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ADParagraphStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ADParagraphStream.java?rev=1056435&r1=1056434&r2=1056435&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ADParagraphStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ADParagraphStream.java Fri Jan 7 18:14:20 2011
@@ -80,6 +80,8 @@ public class ADParagraphStream extends
.compile("^([=-]*)([^:=]+:[^\\(\\s]+)(\\(([^\\)]+)\\))?\\s*$");
private Pattern leafPattern = Pattern
.compile("^([=-]*)([^:=]+:[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");
+ private Pattern bizarreLeafPattern = Pattern
+ .compile("^([=-]*)([^:=]+=[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");
private Pattern punctuationPattern = Pattern.compile("^(=*)(\\W+)$");
/**
@@ -129,45 +131,46 @@ public class ADParagraphStream extends
//line = reader.readLine();
while (line.length() != 0 && line.startsWith("</s>") == false) {
TreeElement element = this.getElement(line);
-
- // remove elements at same level or higher
- while (!nodeStack.isEmpty()
- && element.getLevel() > 0 && element.getLevel() <= nodeStack.peek().getLevel()) {
- nodeStack.pop();
- }
- if( element.isLeaf() ) {
- if (nodeStack.isEmpty()) {
- root.addElement(element);
- } else {
- // look for the node with the correct level
- Node peek = nodeStack.peek();
- if (element.level == 0) { // add to the root
- nodeStack.firstElement().addElement(element);
- } else {
- Node parent = null;
- int index = nodeStack.size() - 1;
- while(parent == null) {
- if(peek.getLevel() < element.getLevel()) {
- parent = peek;
- } else {
- index--;
- if(index > -1) {
- peek = nodeStack.get(index);
- } else {
- parent = nodeStack.firstElement();
- }
- }
- }
- parent.addElement(element);
- }
+
+ if(element != null) {
+ // remove elements at same level or higher
+ while (!nodeStack.isEmpty()
+ && element.getLevel() > 0 && element.getLevel() <= nodeStack.peek().getLevel()) {
+ nodeStack.pop();
}
- } else {
- if (!nodeStack.isEmpty()) {
- nodeStack.peek().addElement(element);
+ if( element.isLeaf() ) {
+ if (nodeStack.isEmpty()) {
+ root.addElement(element);
+ } else {
+ // look for the node with the correct level
+ Node peek = nodeStack.peek();
+ if (element.level == 0) { // add to the root
+ nodeStack.firstElement().addElement(element);
+ } else {
+ Node parent = null;
+ int index = nodeStack.size() - 1;
+ while(parent == null) {
+ if(peek.getLevel() < element.getLevel()) {
+ parent = peek;
+ } else {
+ index--;
+ if(index > -1) {
+ peek = nodeStack.get(index);
+ } else {
+ parent = nodeStack.firstElement();
+ }
+ }
+ }
+ parent.addElement(element);
+ }
+ }
+ } else {
+ if (!nodeStack.isEmpty()) {
+ nodeStack.peek().addElement(element);
+ }
+ nodeStack.push((Node) element);
}
- nodeStack.push((Node) element);
}
-
line = reader.readLine();
}
@@ -234,6 +237,46 @@ public class ADParagraphStream extends
return leaf;
}
+ // process the bizarre cases
+ if(line.equals("_") || line.startsWith("<lixo") || line.startsWith("pause")) {
+ return null;
+ }
+
+ if(line.startsWith("=")) {
+ Matcher bizarreLeafMatcher = bizarreLeafPattern.matcher(line);
+ if (bizarreLeafMatcher.matches()) {
+ int level = bizarreLeafMatcher.group(1).length();
+ String syntacticTag = bizarreLeafMatcher.group(2);
+ String lemma = bizarreLeafMatcher.group(3);
+ String morphologicalTag = bizarreLeafMatcher.group(4);
+ String lexeme = bizarreLeafMatcher.group(5);
+ Leaf leaf = new Leaf();
+ leaf.setLevel(level);
+ leaf.setSyntacticTag(syntacticTag);
+ leaf.setMorphologicalTag(morphologicalTag);
+ leaf.setLexeme(lexeme);
+ if (lemma != null) {
+ if (lemma.length() > 2) {
+ lemma = lemma.substring(1, lemma.length() - 1);
+ }
+ leaf.setLemma(lemma);
+ }
+
+ return leaf;
+ } else {
+ int level = line.lastIndexOf("=");
+ String lexeme = line.substring(level + 1);
+
+ Leaf leaf = new Leaf();
+ leaf.setLevel(level + 1);
+ leaf.setSyntacticTag("");
+ leaf.setMorphologicalTag("");
+ leaf.setLexeme(lexeme);
+
+ return leaf;
+ }
+ }
+
System.err.println("Couldn't parse leaf: " + line);
Leaf leaf = new Leaf();
leaf.setLevel(0);