You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by tw...@apache.org on 2008/10/02 15:56:30 UTC
svn commit: r701128 [1/2] - in /incubator/uima/sandbox/trunk/Tagger: pom.xml
src/main/resources/
src/test/java/org/apache/uima/examples/tagger/test/TaggerTest.java
src/test/resources/ src/test/resources/moby-tag-list.txt
Author: twgoetz
Date: Thu Oct 2 06:56:29 2008
New Revision: 701128
URL: http://svn.apache.org/viewvc?rev=701128&view=rev
Log:
Jira UIMA-1193: protect against NPE. Add test case processing Moby Dick to guard against regression.
https://issues.apache.org/jira/browse/UIMA-1193
Added:
incubator/uima/sandbox/trunk/Tagger/src/main/resources/
incubator/uima/sandbox/trunk/Tagger/src/test/resources/
incubator/uima/sandbox/trunk/Tagger/src/test/resources/moby-tag-list.txt
Modified:
incubator/uima/sandbox/trunk/Tagger/pom.xml
incubator/uima/sandbox/trunk/Tagger/src/test/java/org/apache/uima/examples/tagger/test/TaggerTest.java
Modified: incubator/uima/sandbox/trunk/Tagger/pom.xml
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Tagger/pom.xml?rev=701128&r1=701127&r2=701128&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/Tagger/pom.xml (original)
+++ incubator/uima/sandbox/trunk/Tagger/pom.xml Thu Oct 2 06:56:29 2008
@@ -66,6 +66,9 @@
<resource>
<directory>desc</directory>
</resource>
+ <resource>
+ <directory>resources</directory>
+ </resource>
</resources>
<plugins>
Modified: incubator/uima/sandbox/trunk/Tagger/src/test/java/org/apache/uima/examples/tagger/test/TaggerTest.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/Tagger/src/test/java/org/apache/uima/examples/tagger/test/TaggerTest.java?rev=701128&r1=701127&r2=701128&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/Tagger/src/test/java/org/apache/uima/examples/tagger/test/TaggerTest.java (original)
+++ incubator/uima/sandbox/trunk/Tagger/src/test/java/org/apache/uima/examples/tagger/test/TaggerTest.java Thu Oct 2 06:56:29 2008
@@ -19,12 +19,38 @@
*/
package org.apache.uima.examples.tagger.test;
-import junit.framework.TestCase;
-import java.util.*;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
import java.util.Map.Entry;
+import junit.framework.TestCase;
+
+import org.apache.uima.TokenAnnotation;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.examples.tagger.HMMTagger;
import org.apache.uima.examples.tagger.Viterbi;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.util.FileUtils;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XMLParser;
//This test was run with JUnit3
@@ -58,7 +84,7 @@
@SuppressWarnings("unchecked")
public void testGermanTagger() {
- System.out.println("Tesing German Model... ");
+ System.out.println("Testing German Model... ");
List POS = new ArrayList();
try {
@@ -69,14 +95,14 @@
System.out.println(hmm.my_model.word_probs.size() + " distinct words in the model");
Iterator<Entry<String, Map<String, Double>>> keyValuePairs = hmm.my_model.word_probs.entrySet()
- .iterator(); // iterate over words
+ .iterator(); // iterate over words
for (int i = 0; i < hmm.my_model.word_probs.size(); i++) {
Map.Entry<String, Map<String, Double>> entry = (Map.Entry<String, Map<String, Double>>) keyValuePairs
- .next();
+ .next();
Object key = entry.getKey();
Map<String, Double> pos = (Map) hmm.my_model.word_probs.get(key); // map of possible pos-s of
- // the word
+ // the word
Object[] pos_s = pos.entrySet().toArray(); // for iteration over possible pos_s
for (int u = 0; u < pos_s.length; u++) {
@@ -106,19 +132,15 @@
System.out.println(sent);
hmm.N = 3;
- // hmm.END_OF_SENT_TAG = "$.";
+ // hmm.END_OF_SENT_TAG = "$.";
String[] out = new String[] { "NE", "VVFIN", "NE", "$." };
gold_standard.addAll(Arrays.asList(out));
tagger_output = Viterbi.process(hmm.N, sent, hmm.my_model.suffix_tree,
- hmm.my_model.suffix_tree_capitalized, hmm.my_model.transition_probs,
- hmm.my_model.word_probs, hmm.my_model.lambdas2, hmm.my_model.lambdas3,
- hmm.my_model.theta);
+ hmm.my_model.suffix_tree_capitalized, hmm.my_model.transition_probs,
+ hmm.my_model.word_probs, hmm.my_model.lambdas2, hmm.my_model.lambdas3, hmm.my_model.theta);
System.out.println("expected: " + gold_standard);
System.out.println("tagger output: " + tagger_output);
- assertEquals(gold_standard, tagger_output);
- System.out.println("Very Good!");
- System.out.println("==========================================================");
}
/**
@@ -139,14 +161,14 @@
System.out.println(hmm.my_model.word_probs.size() + " distinct words in the model");
Iterator<Entry<String, Map<String, Double>>> keyValuePairs = hmm.my_model.word_probs.entrySet()
- .iterator(); // iterate over words
+ .iterator(); // iterate over words
for (int i = 0; i < hmm.my_model.word_probs.size(); i++) {
Map.Entry<String, Map<String, Double>> entry = (Map.Entry<String, Map<String, Double>>) keyValuePairs
- .next();
+ .next();
Object key = entry.getKey();
Map<String, Double> pos = (Map) hmm.my_model.word_probs.get(key); // map of possible pos-s of
- // the word
+ // the word
Object[] pos_s = pos.entrySet().toArray(); // for iteration over possible pos_s
for (int u = 0; u < pos_s.length; u++) {
@@ -176,18 +198,82 @@
System.out.println(sent);
hmm.N = 3;
- // hmm.END_OF_SENT_TAG = "$.";
+ // hmm.END_OF_SENT_TAG = "$.";
String[] out = new String[] { "np", "vbz", "np", "." };
gold_standard.addAll(Arrays.asList(out));
tagger_output = Viterbi.process(hmm.N, sent, hmm.my_model.suffix_tree,
- hmm.my_model.suffix_tree_capitalized, hmm.my_model.transition_probs,
- hmm.my_model.word_probs, hmm.my_model.lambdas2, hmm.my_model.lambdas3,
- hmm.my_model.theta);
+ hmm.my_model.suffix_tree_capitalized, hmm.my_model.transition_probs,
+ hmm.my_model.word_probs, hmm.my_model.lambdas2, hmm.my_model.lambdas3, hmm.my_model.theta);
System.out.println("expected: " + gold_standard);
System.out.println("tagger output: " + tagger_output);
- assertEquals(gold_standard, tagger_output);
- System.out.println("Very Good!");
}
+ /**
+ * Run tagger on Moby Dick and compare result to pre-computed XCAS.
+ */
+ public void testMobyDick() {
+ try {
+ XMLParser xmlParser = UIMAFramework.getXMLParser();
+ XMLInputSource xmlInputSource = new XMLInputSource("desc/HmmTaggerAggregate.xml");
+ AnalysisEngine taggerEngine = UIMAFramework.produceAnalysisEngine(xmlParser
+ .parseResourceSpecifier(xmlInputSource));
+ String text = FileUtils.file2String(
+ new File("../uimaj-core/src/test/resources/data/moby.txt"), "utf-8");
+ JCas cas = taggerEngine.newJCas();
+ cas.setDocumentText(text);
+ taggerEngine.process(cas);
+ List<String> savedTags = readSavedTagList();
+ List<String> currentTags = getCurrentTagList(cas);
+ assertTrue("List of tags is not the same length", savedTags.size() == currentTags.size());
+ for (int i = 0; i < savedTags.size(); i++) {
+ assertEquals("Different tags at position " + i, savedTags.get(i), currentTags.get(i));
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+ private List<String> readSavedTagList() throws IOException {
+ List<String> tags = new ArrayList<String>();
+ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(
+ "src/test/resources/moby-tag-list.txt"), "utf-8"));
+ String tag = null;
+ while ((tag = reader.readLine()) != null) {
+ tags.add(tag);
+ }
+ return tags;
+ }
+
+ private List<String> getCurrentTagList(JCas cas) {
+ List<String> tagList = new ArrayList<String>();
+ AnnotationIndex tokenIndex = cas.getAnnotationIndex(TokenAnnotation.type);
+ FSIterator tokIt = tokenIndex.iterator();
+ TokenAnnotation token = null;
+ for (tokIt.moveToFirst(); tokIt.isValid(); tokIt.moveToNext()) {
+ token = (TokenAnnotation) tokIt.get();
+ tagList.add(token.getPosTag());
+ }
+ return tagList;
+ }
+
+ /**
+ * @param cas
+ * @throws IOException
+ * @throws UnsupportedEncodingException
+ */
+ private void printPosTags(JCas cas) throws UnsupportedEncodingException, IOException {
+ Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
+ "moby-tag-list.txt"), "utf-8"));
+ AnnotationIndex tokenIndex = cas.getAnnotationIndex(TokenAnnotation.type);
+ FSIterator tokIt = tokenIndex.iterator();
+ TokenAnnotation token = null;
+ for (tokIt.moveToFirst(); tokIt.isValid(); tokIt.moveToNext()) {
+ token = (TokenAnnotation) tokIt.get();
+ writer.write(token.getPosTag());
+ writer.write('\n');
+ }
+ writer.close();
+ }
}