You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/01/25 17:57:34 UTC

svn commit: r1063351 - in /incubator/opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/chunker/ChunkSample.java test/java/opennlp/tools/chunker/ChunkSampleTest.java

Author: colen
Date: Tue Jan 25 16:57:34 2011
New Revision: 1063351

URL: http://svn.apache.org/viewvc?rev=1063351&view=rev
Log:
OPENNLP-85 Created a static method to create spans of phrase chunks and added javadoc

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java
    incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java?rev=1063351&r1=1063350&r2=1063351&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java Tue Jan 25 16:57:34 2011
@@ -24,13 +24,25 @@ import java.util.List;
 
 import opennlp.tools.util.Span;
 
+/**
+ * Class for holding chunks for a single unit of text.
+ */
 public class ChunkSample {
+	
   private final List<String> sentence;
-
   private final List<String> tags;
-  
   private final List<String> preds;
-  
+
+  /**
+   * Initializes the current instance.
+   * 
+   * @param sentence
+   *          training sentence
+   * @param tags
+   *          POS Tags for the sentence
+   * @param preds
+   *          Chunk tags in B-* I-* notation
+   */
   public ChunkSample(String[] sentence, String[] tags, String[] preds) {
     
     if (sentence.length != tags.length || tags.length != preds.length)
@@ -41,56 +53,102 @@ public class ChunkSample {
     this.preds = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(preds)));
   }
 
+  /**
+   * Initializes the current instance.
+   * 
+   * @param sentence
+   *          training sentence
+   * @param tags
+   *          POS Tags for the sentence
+   * @param preds
+   *          Chunk tags in B-* I-* notation
+   */
   public ChunkSample(List<String> sentence, List<String> tags, List<String> preds) {
-    // TODO: Add validation of params ...
+  	if (sentence.size() != tags.size()  || tags.size()  != preds.size() )
+      throw new IllegalArgumentException("All arrays must have the same length!");
+  	
     this.sentence = Collections.unmodifiableList(new ArrayList<String>((sentence)));
     this.tags = Collections.unmodifiableList(new ArrayList<String>((tags)));
     this.preds = Collections.unmodifiableList(new ArrayList<String>((preds)));
   }
-  
+ 
+  /** Gets the training sentence */
   public String[] getSentence() {
     return sentence.toArray(new String[sentence.size()]);
   }
-  
+
+  /** Gets the POS Tags for the sentence */
   public String[] getTags() {
     return tags.toArray(new String[tags.size()]);
   }
   
+  /** Gets the Chunk tags in B-* I-* notation */
   public String[] getPreds() {
     return preds.toArray(new String[preds.size()]);
   }
   
+  /** Gets the phrases as an array of spans */
   public Span[] getPhrasesAsSpanList() {
-	  List<Span> phrases =  new ArrayList<Span>();
-	  String startTag = "";
-	  int startIndex = 0;
-	  boolean foundPhrase = false;
-	    
-	    for (int ci=0, cn = preds.size(); ci < cn; ci++) {
-	    	String pred = preds.get(ci);
-	    	if( pred.startsWith("B-") || ( !pred.equals("I-" + startTag) && !pred.equals("O") )) { // start
-	    		if(foundPhrase) { // handle the last
-	    			phrases.add(new Span(startIndex, ci, startTag));
-	    		}
-	    		startIndex = ci;
-	    		startTag = pred.substring(2);
-	    		foundPhrase = true;
-	    	} else if(pred.equals("I-" + startTag)) { // middle 
-	    		// do nothing
-	    	} else if(foundPhrase) {// end
-	    		phrases.add(new Span(startIndex, ci, startTag));
-	    		foundPhrase = false;
-	    		startTag = "";
-	    	}
-	    }
-	    if(foundPhrase) { // leftover
-	    	phrases.add(new Span(startIndex, preds.size(), startTag));
-	    }
-	    
-	    return phrases.toArray(new Span[phrases.size()]);
+    return phrasesAsSpanList(getSentence(), getTags(), getPreds());
   }
   
+  /**
+   * Static method to create arrays of spans of phrases
+   * 
+   * @param aSentence
+   *          training sentence
+   * @param aTags
+   *          POS Tags for the sentence
+   * @param aPreds
+   *          Chunk tags in B-* I-* notation
+   * 
+   * @return the phrases as an array of spans
+   */
+  public static Span[] phrasesAsSpanList(String[] aSentence, String[] aTags,
+      String[] aPreds) {
+
+    if (aSentence.length != aTags.length || aTags.length != aPreds.length)
+      throw new IllegalArgumentException(
+          "All arrays must have the same length!");
+
+    List<Span> phrases = new ArrayList<Span>();
+    String startTag = "";
+    int startIndex = 0;
+    boolean foundPhrase = false;
+
+    for (int ci = 0, cn = aPreds.length; ci < cn; ci++) {
+      String pred = aPreds[ci];
+      if (pred.startsWith("B-")
+          || (!pred.equals("I-" + startTag) && !pred.equals("O"))) { // start
+        if (foundPhrase) { // handle the last
+          phrases.add(new Span(startIndex, ci, startTag));
+        }
+        startIndex = ci;
+        startTag = pred.substring(2);
+        foundPhrase = true;
+      } else if (pred.equals("I-" + startTag)) { // middle
+        // do nothing
+      } else if (foundPhrase) {// end
+        phrases.add(new Span(startIndex, ci, startTag));
+        foundPhrase = false;
+        startTag = "";
+      }
+    }
+    if (foundPhrase) { // leftover
+      phrases.add(new Span(startIndex, aPreds.length, startTag));
+    }
+
+    return phrases.toArray(new Span[phrases.size()]);
+  }
   
+  /**
+   * Creates a nice to read string for the phrases formatted as follows: <br>
+   * <code>
+   * [NP Rockwell_NNP ] [VP said_VBD ] [NP the_DT agreement_NN ] [VP calls_VBZ ] [SBAR for_IN ] [NP it_PRP ] [VP to_TO supply_VB ] [NP 200_CD additional_JJ so-called_JJ shipsets_NNS ] [PP for_IN ] [NP the_DT planes_NNS ] ._.
+   * </code>
+   * 
+   * @return a nice to read string representation of the chunk phrases
+   */
   public String nicePrint() {
   	
   	Span[] spans = getPhrasesAsSpanList();

Modified: incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java?rev=1063351&r1=1063350&r2=1063351&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java Tue Jan 25 16:57:34 2011
@@ -130,6 +130,19 @@ public class ChunkSampleTest {
 	assertEquals(new Span(5, 6, "VP"), spans[3]);
 	assertEquals(new Span(6, 7, "ADVP"), spans[4]);
   }
+  
+	@Test
+	public void testPhraseAsSpan() {
+		Span[] spans = ChunkSample.phrasesAsSpanList(createSentence(),
+				createTags(), createChunks());
+
+		assertEquals(5, spans.length);
+		assertEquals(new Span(0, 1, "NP"), spans[0]);
+		assertEquals(new Span(1, 2, "PP"), spans[1]);
+		assertEquals(new Span(2, 5, "NP"), spans[2]);
+		assertEquals(new Span(5, 6, "VP"), spans[3]);
+		assertEquals(new Span(6, 7, "ADVP"), spans[4]);
+	}
 
   @Test
   public void testRegions() throws IOException {