You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/31 10:18:54 UTC
svn commit: r1129557 - in /incubator/opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/postag/POSTaggerME.java
test/java/opennlp/tools/postag/POSTaggerMETest.java
Author: joern
Date: Tue May 31 08:18:54 2011
New Revision: 1129557
URL: http://svn.apache.org/viewvc?rev=1129557&view=rev
Log:
OPENNLP-187 Added util method to build ngram dictionary
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java?rev=1129557&r1=1129556&r2=1129557&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java Tue May 31 08:18:54 2011
@@ -30,10 +30,12 @@ import opennlp.model.AbstractModel;
import opennlp.model.EventStream;
import opennlp.model.TrainUtil;
import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.ngram.NGramModel;
import opennlp.tools.util.BeamSearch;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.StringList;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelType;
@@ -343,4 +345,22 @@ public class POSTaggerME implements POST
return train(languageCode, samples, params, tagDictionary, ngramDictionary);
}
+
+ public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
+ throws IOException {
+
+ NGramModel ngramModel = new NGramModel();
+
+ POSSample sample;
+ while((sample = samples.read()) != null) {
+ String[] words = sample.getSentence();
+
+ if (words.length > 0)
+ ngramModel.add(new StringList(words), 1, 1);
+ }
+
+ ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
+
+ return ngramModel.toDictionary(true);
+ }
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java?rev=1129557&r1=1129556&r2=1129557&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java Tue May 31 08:18:54 2011
@@ -24,6 +24,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.model.ModelType;
import org.junit.Test;
@@ -33,19 +34,22 @@ import org.junit.Test;
*/
public class POSTaggerMETest {
+ private static ObjectStream<POSSample> createSampleStream() throws IOException {
+ InputStream in = POSTaggerMETest.class.getClassLoader().getResourceAsStream(
+ "opennlp/tools/postag/AnnotatedSentences.txt");
+
+ return new WordTagSampleStream((new InputStreamReader(in)));
+ }
+
/**
* Trains a POSModel from the annotated test data.
*
* @return
* @throws IOException
*/
- // TODO: also use tag dictionary for training
static POSModel trainPOSModel(ModelType type) throws IOException {
- InputStream in = POSTaggerMETest.class.getClassLoader().getResourceAsStream(
- "opennlp/tools/postag/AnnotatedSentences.txt");
-
- return POSTaggerME.train("en", new WordTagSampleStream((
- new InputStreamReader(in))), type, null, null, 5, 100);
+ // TODO: also use tag dictionary for training
+ return POSTaggerME.train("en", createSampleStream(), type, null, null, 5, 100);
}
@Test
@@ -71,4 +75,11 @@ public class POSTaggerMETest {
assertEquals("VBN", tags[4]);
assertEquals(".", tags[5]);
}
+
+ @Test
+ public void testBuildNGramDictionary() throws IOException {
+ ObjectStream<POSSample> samples = createSampleStream();
+
+ POSTaggerME.buildNGramDictionary(samples, 0);
+ }
}
\ No newline at end of file