You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/08/01 16:12:19 UTC
svn commit: r1368010 - in /opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/formats/ad/ test/java/opennlp/tools/formats/ad/
test/resources/opennlp/tools/formats/
Author: colen
Date: Wed Aug 1 14:12:19 2012
New Revision: 1368010
URL: http://svn.apache.org/viewvc?rev=1368010&view=rev
Log:
OPENNLP-529: The AD formatter was not working with the Amazonia corpus. We now add a fake root node if there are multiple roots.
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java Wed Aug 1 14:12:19 2012
@@ -146,59 +146,78 @@ public class ADSentenceStream extends
sentence.setText(text);
sentence.setMetadata(meta);
// now we look for the root node
- line = reader.readLine();
+ // skip lines starting with ###
+ line = reader.readLine();
+ while(line != null && line.startsWith("###")) {
+ line = reader.readLine();
+ }
+
// got the root. Add it to the stack
Stack<Node> nodeStack = new Stack<Node>();
- // we get the complete line
- root.setSyntacticTag(line);
+ root.setSyntacticTag("ROOT");
root.setLevel(0);
nodeStack.add(root);
+
/* now we have to take care of the lastLevel. Every time it raises, we will add the
leaf to the node at the top. If it decreases, we remove the top. */
- line = reader.readLine();
+
while (line != null && line.length() != 0 && line.startsWith("</s>") == false && !line.equals("&&")) {
TreeElement element = this.getElement(line);
if(element != null) {
- // remove elements at same level or higher
- while (!nodeStack.isEmpty()
- && element.getLevel() > 0 && element.getLevel() <= nodeStack.peek().getLevel()) {
- nodeStack.pop();
+ // The idea here is to keep a stack of nodes that are candidates for
+ // parenting the following elements (nodes and leafs).
+
+ // 1) When we get a new element, we check its level and remove from
+ // the top of the stack nodes that are brothers or nephews.
+ while (!nodeStack.isEmpty() && element.getLevel() > 0
+ && element.getLevel() <= nodeStack.peek().getLevel()) {
+ Node nephew = nodeStack.pop();
}
+
if( element.isLeaf() ) {
+ // 2a) If the element is a leaf and there is no parent candidate,
+ // add it as a daughter of the root.
if (nodeStack.isEmpty()) {
root.addElement(element);
- } else {
- // look for the node with the correct level
- Node peek = nodeStack.peek();
- if (element.level == 0) { // add to the root
- nodeStack.firstElement().addElement(element);
- } else {
- Node parent = null;
- int index = nodeStack.size() - 1;
- while(parent == null) {
- if(peek.getLevel() < element.getLevel()) {
- parent = peek;
- } else {
- index--;
- if(index > -1) {
- peek = nodeStack.get(index);
- } else {
- parent = nodeStack.firstElement();
- }
- }
- }
- parent.addElement(element);
- }
+ } else {
+ // 2b) There are parent candidates.
+ // look for the node with the correct level
+ Node peek = nodeStack.peek();
+ if (element.level == 0) { // add to the root
+ nodeStack.firstElement().addElement(element);
+ } else {
+ Node parent = null;
+ int index = nodeStack.size() - 1;
+ while (parent == null) {
+ if (peek.getLevel() < element.getLevel()) {
+ parent = peek;
+ } else {
+ index--;
+ if (index > -1) {
+ peek = nodeStack.get(index);
+ } else {
+ parent = nodeStack.firstElement();
+ }
+ }
+ }
+ parent.addElement(element);
+ }
}
} else {
- if (!nodeStack.isEmpty()) {
- nodeStack.peek().addElement(element);
+ // 3) Check if the element that is at the top of the stack is this
+ // node parent, if yes add it as a son
+ if (!nodeStack.isEmpty() && nodeStack.peek().getLevel() < element.getLevel()) {
+ nodeStack.peek().addElement(element);
+ } else {
+ System.err.println("should not happen!");
}
+ // 4) Add it to the stack so it is a parent candidate.
nodeStack.push((Node) element);
+
}
}
line = reader.readLine();
@@ -228,10 +247,12 @@ public class ADSentenceStream extends
* @return the tree element
*/
public TreeElement getElement(String line) {
+ // Note: all levels are at least 1, because 0 is reserved for the root.
+
// try node
Matcher nodeMatcher = nodePattern.matcher(line);
if (nodeMatcher.matches()) {
- int level = nodeMatcher.group(1).length();
+ int level = nodeMatcher.group(1).length() + 1;
String syntacticTag = nodeMatcher.group(2);
Node node = new Node();
node.setLevel(level);
@@ -241,7 +262,7 @@ public class ADSentenceStream extends
Matcher leafMatcher = leafPattern.matcher(line);
if (leafMatcher.matches()) {
- int level = leafMatcher.group(1).length();
+ int level = leafMatcher.group(1).length() + 1;
String syntacticTag = leafMatcher.group(2);
String funcTag = leafMatcher.group(3);
String lemma = leafMatcher.group(4);
@@ -262,7 +283,7 @@ public class ADSentenceStream extends
Matcher punctuationMatcher = punctuationPattern.matcher(line);
if (punctuationMatcher.matches()) {
- int level = punctuationMatcher.group(1).length();
+ int level = punctuationMatcher.group(1).length() + 1;
String lexeme = punctuationMatcher.group(2);
Leaf leaf = new Leaf();
leaf.setLevel(level);
@@ -278,7 +299,7 @@ public class ADSentenceStream extends
if(line.startsWith("=")) {
Matcher bizarreLeafMatcher = bizarreLeafPattern.matcher(line);
if (bizarreLeafMatcher.matches()) {
- int level = bizarreLeafMatcher.group(1).length();
+ int level = bizarreLeafMatcher.group(1).length() + 1;
String syntacticTag = bizarreLeafMatcher.group(2);
String lemma = bizarreLeafMatcher.group(3);
String morphologicalTag = bizarreLeafMatcher.group(4);
@@ -297,7 +318,7 @@ public class ADSentenceStream extends
return leaf;
} else {
- int level = line.lastIndexOf("=");
+ int level = line.lastIndexOf("=") + 1;
String lexeme = line.substring(level + 1);
if(lexeme.matches("\\w.*?[\\.<>].*")) {
@@ -316,7 +337,7 @@ public class ADSentenceStream extends
System.err.println("Couldn't parse leaf: " + line);
Leaf leaf = new Leaf();
- leaf.setLevel(0);
+ leaf.setLevel(1);
leaf.setSyntacticTag("");
leaf.setMorphologicalTag("");
leaf.setLexeme(line);
Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java Wed Aug 1 14:12:19 2012
@@ -37,7 +37,7 @@ public class ADChunkSampleStreamTest {
@Test
public void testSimpleCount() throws IOException {
- assertEquals(6, samples.size());
+ assertEquals(ADParagraphStreamTest.NUM_SENTENCES, samples.size());
}
@Test
@@ -45,7 +45,7 @@ public class ADChunkSampleStreamTest {
assertEquals("Inicia", samples.get(0).getSentence()[0]);
assertEquals("v-fin", samples.get(0).getTags()[0]);
- assertEquals("B-NP", samples.get(0).getPreds()[2]);
+ assertEquals("B-VP", samples.get(0).getPreds()[0]);
assertEquals("em", samples.get(0).getSentence()[1]);
assertEquals("prp", samples.get(0).getTags()[1]);
Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java Wed Aug 1 14:12:19 2012
@@ -38,7 +38,7 @@ public class ADNameSampleStreamTest {
@Test
public void testSimpleCount() throws IOException {
- assertEquals(6, samples.size());
+ assertEquals(ADParagraphStreamTest.NUM_SENTENCES, samples.size());
}
@Test
@@ -98,6 +98,18 @@ public class ADNameSampleStreamTest {
assertEquals(new Span(0, 3, "person"), samples.get(5).getNames()[0]);// 0..1
}
+
+ @Test
+ public void testSmallSentence() throws IOException {
+ assertEquals(2, samples.get(6).getSentence().length);
+ }
+
+ @Test
+ public void testMissingRightContraction() throws IOException {
+ assertEquals(new Span(0, 1, "person"), samples.get(7).getNames()[0]);
+ assertEquals(new Span(3, 4, "person"), samples.get(7).getNames()[1]);
+ assertEquals(new Span(5, 6, "person"), samples.get(7).getNames()[2]);
+ }
@Before
public void setup() throws IOException {
Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java Wed Aug 1 14:12:19 2012
@@ -29,6 +29,8 @@ import org.junit.Test;
public class ADParagraphStreamTest {
+ public static final int NUM_SENTENCES = 8;
+
@Test
public void testSimpleReading() throws IOException {
int count = 0;
@@ -43,7 +45,7 @@ public class ADParagraphStreamTest {
// paragraph.getRoot();
}
- assertEquals(6, count);
+ assertEquals(ADParagraphStreamTest.NUM_SENTENCES, count);
}
@Test
@@ -59,7 +61,7 @@ public class ADParagraphStreamTest {
paragraph = stream.read();
}
- assertEquals(6, count);
+ assertEquals(ADParagraphStreamTest.NUM_SENTENCES, count);
}
private static ADSentenceStream openData() throws IOException {
Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java Wed Aug 1 14:12:19 2012
@@ -38,7 +38,7 @@ public class ADSentenceSampleStreamTest
@Test
public void testSimpleCount() throws IOException {
- assertEquals(3, samples.size()); // means that there are 3 documents
+ assertEquals(5, samples.size());
}
@Test
Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java Wed Aug 1 14:12:19 2012
@@ -38,7 +38,7 @@ public class ADTokenSampleStreamTest {
@Test
public void testSimpleCount() throws IOException {
- assertEquals(6, samples.size()); // means that there are 3 documents
+ assertEquals(ADParagraphStreamTest.NUM_SENTENCES, samples.size());
}
@Test
Modified: opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
Binary files - no diff available.