You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/31 10:18:54 UTC

svn commit: r1129557 - in /incubator/opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/postag/POSTaggerME.java test/java/opennlp/tools/postag/POSTaggerMETest.java

Author: joern
Date: Tue May 31 08:18:54 2011
New Revision: 1129557

URL: http://svn.apache.org/viewvc?rev=1129557&view=rev
Log:
OPENNLP-187 Added util method to build ngram dictionary

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
    incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java?rev=1129557&r1=1129556&r2=1129557&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java Tue May 31 08:18:54 2011
@@ -30,10 +30,12 @@ import opennlp.model.AbstractModel;
 import opennlp.model.EventStream;
 import opennlp.model.TrainUtil;
 import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.ngram.NGramModel;
 import opennlp.tools.util.BeamSearch;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.StringList;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.ModelType;
 
@@ -343,4 +345,22 @@ public class POSTaggerME implements POST
     
     return train(languageCode, samples, params, tagDictionary, ngramDictionary);
   }
+  
+  public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
+      throws IOException {
+    
+    NGramModel ngramModel = new NGramModel();
+    
+    POSSample sample;
+    while((sample = samples.read()) != null) {
+      String[] words = sample.getSentence();
+      
+      if (words.length > 0)
+        ngramModel.add(new StringList(words), 1, 1);
+    }
+    
+    ngramModel.cutoff(cutoff, Integer.MAX_VALUE);
+    
+    return ngramModel.toDictionary(true);
+  }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java?rev=1129557&r1=1129556&r2=1129557&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java Tue May 31 08:18:54 2011
@@ -24,6 +24,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 
+import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.model.ModelType;
 
 import org.junit.Test;
@@ -33,19 +34,22 @@ import org.junit.Test;
  */
 public class POSTaggerMETest {
 
+  private static ObjectStream<POSSample> createSampleStream() throws IOException {
+    InputStream in = POSTaggerMETest.class.getClassLoader().getResourceAsStream(
+        "opennlp/tools/postag/AnnotatedSentences.txt");
+    
+    return new WordTagSampleStream((new InputStreamReader(in)));
+  }
+  
   /**
    * Trains a POSModel from the annotated test data.
    *
    * @return
    * @throws IOException
    */
-  // TODO: also use tag dictionary for training
   static POSModel trainPOSModel(ModelType type) throws IOException {
-    InputStream in = POSTaggerMETest.class.getClassLoader().getResourceAsStream(
-        "opennlp/tools/postag/AnnotatedSentences.txt");
-
-    return POSTaggerME.train("en", new WordTagSampleStream((
-        new InputStreamReader(in))), type, null, null, 5, 100);
+    // TODO: also use tag dictionary for training
+    return POSTaggerME.train("en", createSampleStream(), type, null, null, 5, 100);
   }
 
   @Test
@@ -71,4 +75,11 @@ public class POSTaggerMETest {
     assertEquals("VBN", tags[4]);
     assertEquals(".", tags[5]);
   }
+  
+  @Test
+  public void testBuildNGramDictionary() throws IOException {
+    ObjectStream<POSSample> samples = createSampleStream();
+    
+    POSTaggerME.buildNGramDictionary(samples, 0);
+  }
 }
\ No newline at end of file