You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2011/01/25 17:57:34 UTC
svn commit: r1063351 - in /incubator/opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/chunker/ChunkSample.java
test/java/opennlp/tools/chunker/ChunkSampleTest.java
Author: colen
Date: Tue Jan 25 16:57:34 2011
New Revision: 1063351
URL: http://svn.apache.org/viewvc?rev=1063351&view=rev
Log:
OPENNLP-85 Created a static method to create spans of phrase chunks and added javadoc
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java
incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java?rev=1063351&r1=1063350&r2=1063351&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSample.java Tue Jan 25 16:57:34 2011
@@ -24,13 +24,25 @@ import java.util.List;
import opennlp.tools.util.Span;
+/**
+ * Class for holding chunks for a single unit of text.
+ */
public class ChunkSample {
+
private final List<String> sentence;
-
private final List<String> tags;
-
private final List<String> preds;
-
+
+ /**
+ * Initializes the current instance.
+ *
+ * @param sentence
+ * training sentence
+ * @param tags
+ * POS Tags for the sentence
+ * @param preds
+ * Chunk tags in B-* I-* notation
+ */
public ChunkSample(String[] sentence, String[] tags, String[] preds) {
if (sentence.length != tags.length || tags.length != preds.length)
@@ -41,56 +53,102 @@ public class ChunkSample {
this.preds = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(preds)));
}
+ /**
+ * Initializes the current instance.
+ *
+ * @param sentence
+ * training sentence
+ * @param tags
+ * POS Tags for the sentence
+ * @param preds
+ * Chunk tags in B-* I-* notation
+ */
public ChunkSample(List<String> sentence, List<String> tags, List<String> preds) {
- // TODO: Add validation of params ...
+ if (sentence.size() != tags.size() || tags.size() != preds.size() )
+ throw new IllegalArgumentException("All arrays must have the same length!");
+
this.sentence = Collections.unmodifiableList(new ArrayList<String>((sentence)));
this.tags = Collections.unmodifiableList(new ArrayList<String>((tags)));
this.preds = Collections.unmodifiableList(new ArrayList<String>((preds)));
}
-
+
+ /** Gets the training sentence */
public String[] getSentence() {
return sentence.toArray(new String[sentence.size()]);
}
-
+
+ /** Gets the POS Tags for the sentence */
public String[] getTags() {
return tags.toArray(new String[tags.size()]);
}
+ /** Gets the Chunk tags in B-* I-* notation */
public String[] getPreds() {
return preds.toArray(new String[preds.size()]);
}
+ /** Gets the phrases as an array of spans */
public Span[] getPhrasesAsSpanList() {
- List<Span> phrases = new ArrayList<Span>();
- String startTag = "";
- int startIndex = 0;
- boolean foundPhrase = false;
-
- for (int ci=0, cn = preds.size(); ci < cn; ci++) {
- String pred = preds.get(ci);
- if( pred.startsWith("B-") || ( !pred.equals("I-" + startTag) && !pred.equals("O") )) { // start
- if(foundPhrase) { // handle the last
- phrases.add(new Span(startIndex, ci, startTag));
- }
- startIndex = ci;
- startTag = pred.substring(2);
- foundPhrase = true;
- } else if(pred.equals("I-" + startTag)) { // middle
- // do nothing
- } else if(foundPhrase) {// end
- phrases.add(new Span(startIndex, ci, startTag));
- foundPhrase = false;
- startTag = "";
- }
- }
- if(foundPhrase) { // leftover
- phrases.add(new Span(startIndex, preds.size(), startTag));
- }
-
- return phrases.toArray(new Span[phrases.size()]);
+ return phrasesAsSpanList(getSentence(), getTags(), getPreds());
}
+ /**
+ * Static method to create arrays of spans of phrases
+ *
+ * @param aSentence
+ * training sentence
+ * @param aTags
+ * POS Tags for the sentence
+ * @param aPreds
+ * Chunk tags in B-* I-* notation
+ *
+ * @return the phrases as an array of spans
+ */
+ public static Span[] phrasesAsSpanList(String[] aSentence, String[] aTags,
+ String[] aPreds) {
+
+ if (aSentence.length != aTags.length || aTags.length != aPreds.length)
+ throw new IllegalArgumentException(
+ "All arrays must have the same length!");
+
+ List<Span> phrases = new ArrayList<Span>();
+ String startTag = "";
+ int startIndex = 0;
+ boolean foundPhrase = false;
+
+ for (int ci = 0, cn = aPreds.length; ci < cn; ci++) {
+ String pred = aPreds[ci];
+ if (pred.startsWith("B-")
+ || (!pred.equals("I-" + startTag) && !pred.equals("O"))) { // start
+ if (foundPhrase) { // handle the last
+ phrases.add(new Span(startIndex, ci, startTag));
+ }
+ startIndex = ci;
+ startTag = pred.substring(2);
+ foundPhrase = true;
+ } else if (pred.equals("I-" + startTag)) { // middle
+ // do nothing
+ } else if (foundPhrase) {// end
+ phrases.add(new Span(startIndex, ci, startTag));
+ foundPhrase = false;
+ startTag = "";
+ }
+ }
+ if (foundPhrase) { // leftover
+ phrases.add(new Span(startIndex, aPreds.length, startTag));
+ }
+
+ return phrases.toArray(new Span[phrases.size()]);
+ }
+ /**
+ * Creates an easy-to-read string for the phrases, formatted as follows: <br>
+ * <code>
+ * [NP Rockwell_NNP ] [VP said_VBD ] [NP the_DT agreement_NN ] [VP calls_VBZ ] [SBAR for_IN ] [NP it_PRP ] [VP to_TO supply_VB ] [NP 200_CD additional_JJ so-called_JJ shipsets_NNS ] [PP for_IN ] [NP the_DT planes_NNS ] ._.
+ * </code>
+ *
+ * @return an easy-to-read string representation of the chunk phrases
+ */
public String nicePrint() {
Span[] spans = getPhrasesAsSpanList();
Modified: incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java?rev=1063351&r1=1063350&r2=1063351&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkSampleTest.java Tue Jan 25 16:57:34 2011
@@ -130,6 +130,19 @@ public class ChunkSampleTest {
assertEquals(new Span(5, 6, "VP"), spans[3]);
assertEquals(new Span(6, 7, "ADVP"), spans[4]);
}
+
+ @Test
+ public void testPhraseAsSpan() {
+ Span[] spans = ChunkSample.phrasesAsSpanList(createSentence(),
+ createTags(), createChunks());
+
+ assertEquals(5, spans.length);
+ assertEquals(new Span(0, 1, "NP"), spans[0]);
+ assertEquals(new Span(1, 2, "PP"), spans[1]);
+ assertEquals(new Span(2, 5, "NP"), spans[2]);
+ assertEquals(new Span(5, 6, "VP"), spans[3]);
+ assertEquals(new Span(6, 7, "ADVP"), spans[4]);
+ }
@Test
public void testRegions() throws IOException {