You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/08/01 16:12:19 UTC

svn commit: r1368010 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/formats/ad/ test/java/opennlp/tools/formats/ad/ test/resources/opennlp/tools/formats/

Author: colen
Date: Wed Aug  1 14:12:19 2012
New Revision: 1368010

URL: http://svn.apache.org/viewvc?rev=1368010&view=rev
Log:
OPENNLP-529: AD formatter was not working with the Amazonia corpus. Now we add a fake root node if there are multiple roots.

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
    opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java Wed Aug  1 14:12:19 2012
@@ -146,59 +146,78 @@ public class ADSentenceStream extends
         sentence.setText(text);
         sentence.setMetadata(meta);
         // now we look for the root node
-        line = reader.readLine();
 
+        // skip lines starting with ###
+        line = reader.readLine();
+        while(line != null && line.startsWith("###")) {
+        	line = reader.readLine();
+        }
+        
         // got the root. Add it to the stack
         Stack<Node> nodeStack = new Stack<Node>();
-        // we get the complete line
 
-        root.setSyntacticTag(line);
+        root.setSyntacticTag("ROOT");
         root.setLevel(0);
         nodeStack.add(root);
         
+        
         /* now we have to take care of the lastLevel. Every time it raises, we will add the
         leaf to the node at the top. If it decreases, we remove the top. */
-        line = reader.readLine();
+        
         while (line != null && line.length() != 0 && line.startsWith("</s>") == false && !line.equals("&&")) {
           TreeElement element = this.getElement(line);
           
           if(element != null) {
-            // remove elements at same level or higher
-            while (!nodeStack.isEmpty()
-                && element.getLevel() > 0 && element.getLevel() <= nodeStack.peek().getLevel()) {
-              nodeStack.pop();
+            // The idea here is to keep a stack of nodes that are candidates for
+            // parenting the following elements (nodes and leafs).
+
+            // 1) When we get a new element, we check its level and remove from
+            // the top of the stack nodes that are brothers or nephews.
+            while (!nodeStack.isEmpty() && element.getLevel() > 0
+                && element.getLevel() <= nodeStack.peek().getLevel()) {
+              Node nephew = nodeStack.pop();
             }
+            
             if( element.isLeaf() ) {
+              // 2a) If the element is a leaf and there is no parent candidate,
+              // add it as a daughter of the root.  
               if (nodeStack.isEmpty()) {
                 root.addElement(element);
-  						} else {
-  							// look for the node with the correct level
-  							Node peek = nodeStack.peek();
-  							if (element.level == 0) { // add to the root
-  								nodeStack.firstElement().addElement(element);
-  							} else {
-  								Node parent = null;
-  								int index = nodeStack.size() - 1;
-  								while(parent == null) {
-  									if(peek.getLevel() < element.getLevel()) {
-  										parent = peek;
-  									} else {
-  										index--;
-  										if(index > -1) {
-  											peek = nodeStack.get(index);
-  										} else {
-  											parent = nodeStack.firstElement();
-  										}
-  									}
-  								}
-  								parent.addElement(element);
-  							}
+              } else {
+                // 2b) There are parent candidates. 
+                // look for the node with the correct level
+                Node peek = nodeStack.peek();
+                if (element.level == 0) { // add to the root
+                  nodeStack.firstElement().addElement(element);
+                } else {
+                  Node parent = null;
+                  int index = nodeStack.size() - 1;
+                  while (parent == null) {
+                    if (peek.getLevel() < element.getLevel()) {
+                      parent = peek;
+                    } else {
+                      index--;
+                      if (index > -1) {
+                        peek = nodeStack.get(index);
+                      } else {
+                        parent = nodeStack.firstElement();
+                      }
+                    }
+                  }
+                  parent.addElement(element);
+                }
               }
             } else {
-              if (!nodeStack.isEmpty()) {
-                nodeStack.peek().addElement(element);
+              // 3) Check if the node at the top of the stack is this node's
+              // parent; if so, add this node as its child.
+              if (!nodeStack.isEmpty() && nodeStack.peek().getLevel() < element.getLevel()) {
+                  nodeStack.peek().addElement(element);
+              } else {
+                System.err.println("should not happen!");
               }
+              // 4) Add it to the stack so it is a parent candidate.
               nodeStack.push((Node) element);
+              
             }
           }
           line = reader.readLine();
@@ -228,10 +247,12 @@ public class ADSentenceStream extends
      * @return the tree element
      */
     public TreeElement getElement(String line) {
+      // Note: all levels are at least 1, because 0 is reserved for the root.
+      
       // try node
       Matcher nodeMatcher = nodePattern.matcher(line);
       if (nodeMatcher.matches()) {
-        int level = nodeMatcher.group(1).length();
+        int level = nodeMatcher.group(1).length() + 1;
         String syntacticTag = nodeMatcher.group(2);
         Node node = new Node();
         node.setLevel(level);
@@ -241,7 +262,7 @@ public class ADSentenceStream extends
 
       Matcher leafMatcher = leafPattern.matcher(line);
       if (leafMatcher.matches()) {
-        int level = leafMatcher.group(1).length();
+        int level = leafMatcher.group(1).length() + 1;
         String syntacticTag = leafMatcher.group(2);
         String funcTag = leafMatcher.group(3);
         String lemma = leafMatcher.group(4);
@@ -262,7 +283,7 @@ public class ADSentenceStream extends
 
       Matcher punctuationMatcher = punctuationPattern.matcher(line);
       if (punctuationMatcher.matches()) {
-        int level = punctuationMatcher.group(1).length();
+        int level = punctuationMatcher.group(1).length() + 1;
         String lexeme = punctuationMatcher.group(2);
         Leaf leaf = new Leaf();
         leaf.setLevel(level);
@@ -278,7 +299,7 @@ public class ADSentenceStream extends
       if(line.startsWith("=")) {
       	Matcher bizarreLeafMatcher = bizarreLeafPattern.matcher(line);
         if (bizarreLeafMatcher.matches()) {
-          int level = bizarreLeafMatcher.group(1).length();
+          int level = bizarreLeafMatcher.group(1).length() + 1;
           String syntacticTag = bizarreLeafMatcher.group(2);
           String lemma = bizarreLeafMatcher.group(3);
           String morphologicalTag = bizarreLeafMatcher.group(4);
@@ -297,7 +318,7 @@ public class ADSentenceStream extends
 
           return leaf;
         } else {
-        	int level = line.lastIndexOf("=");
+        	int level = line.lastIndexOf("=") + 1;
         	String lexeme = line.substring(level + 1);
         	
         	if(lexeme.matches("\\w.*?[\\.<>].*")) {
@@ -316,7 +337,7 @@ public class ADSentenceStream extends
       
       System.err.println("Couldn't parse leaf: " + line);
       Leaf leaf = new Leaf();
-      leaf.setLevel(0);
+      leaf.setLevel(1);
       leaf.setSyntacticTag("");
       leaf.setMorphologicalTag("");
       leaf.setLexeme(line);

Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADChunkSampleStreamTest.java Wed Aug  1 14:12:19 2012
@@ -37,7 +37,7 @@ public class ADChunkSampleStreamTest {
 
   @Test
   public void testSimpleCount() throws IOException {
-    assertEquals(6, samples.size());
+    assertEquals(ADParagraphStreamTest.NUM_SENTENCES, samples.size());
   }
 
   @Test
@@ -45,7 +45,7 @@ public class ADChunkSampleStreamTest {
 
     assertEquals("Inicia", samples.get(0).getSentence()[0]);
     assertEquals("v-fin", samples.get(0).getTags()[0]);
-    assertEquals("B-NP", samples.get(0).getPreds()[2]);
+    assertEquals("B-VP", samples.get(0).getPreds()[0]);
 
     assertEquals("em", samples.get(0).getSentence()[1]);
     assertEquals("prp", samples.get(0).getTags()[1]);

Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java Wed Aug  1 14:12:19 2012
@@ -38,7 +38,7 @@ public class ADNameSampleStreamTest {
 
   @Test
   public void testSimpleCount() throws IOException {
-    assertEquals(6, samples.size());
+    assertEquals(ADParagraphStreamTest.NUM_SENTENCES, samples.size());
   }
   
   @Test
@@ -98,6 +98,18 @@ public class ADNameSampleStreamTest {
     
     assertEquals(new Span(0, 3, "person"), samples.get(5).getNames()[0]);//    0..1
   }
+  
+  @Test
+  public void testSmallSentence() throws IOException {
+    assertEquals(2, samples.get(6).getSentence().length);
+  }
+  
+  @Test
+  public void testMissingRightContraction() throws IOException {
+    assertEquals(new Span(0, 1, "person"), samples.get(7).getNames()[0]);
+    assertEquals(new Span(3, 4, "person"), samples.get(7).getNames()[1]);
+    assertEquals(new Span(5, 6, "person"), samples.get(7).getNames()[2]);
+  }
 
   @Before
   public void setup() throws IOException {

Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADParagraphStreamTest.java Wed Aug  1 14:12:19 2012
@@ -29,6 +29,8 @@ import org.junit.Test;
 
 public class ADParagraphStreamTest {
 
+  public static final int NUM_SENTENCES = 8;
+
   @Test
   public void testSimpleReading() throws IOException {
     int count = 0;
@@ -43,7 +45,7 @@ public class ADParagraphStreamTest {
 //      paragraph.getRoot();
     }
     
-    assertEquals(6, count);
+    assertEquals(ADParagraphStreamTest.NUM_SENTENCES, count);
   }
   
   @Test
@@ -59,7 +61,7 @@ public class ADParagraphStreamTest {
       paragraph = stream.read();
     }
     
-    assertEquals(6, count);
+    assertEquals(ADParagraphStreamTest.NUM_SENTENCES, count);
   }
   
   private static ADSentenceStream openData() throws IOException {

Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADSentenceSampleStreamTest.java Wed Aug  1 14:12:19 2012
@@ -38,7 +38,7 @@ public class ADSentenceSampleStreamTest 
 
   @Test
   public void testSimpleCount() throws IOException {
-    assertEquals(3, samples.size()); // means that there are 3 documents
+    assertEquals(5, samples.size());
   }
 
   @Test

Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java Wed Aug  1 14:12:19 2012
@@ -38,7 +38,7 @@ public class ADTokenSampleStreamTest {
 
   @Test
   public void testSimpleCount() throws IOException {
-    assertEquals(6, samples.size()); // means that there are 3 documents
+    assertEquals(ADParagraphStreamTest.NUM_SENTENCES, samples.size());
   }
 
   @Test

Modified: opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample?rev=1368010&r1=1368009&r2=1368010&view=diff
==============================================================================
Binary files - no diff available.