You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@opennlp.apache.org by co...@apache.org on 2014/04/16 18:14:10 UTC

svn commit: r1587956 - /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java

Author: colen
Date: Wed Apr 16 16:14:09 2014
New Revision: 1587956

URL: http://svn.apache.org/r1587956
Log:
OPENNLP-674 Use tokenizer from the factory

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java?rev=1587956&r1=1587955&r2=1587956&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java Wed Apr 16 16:14:09 2014
@@ -27,7 +27,6 @@ import opennlp.tools.cmdline.SystemInput
 import opennlp.tools.doccat.DoccatModel;
 import opennlp.tools.doccat.DocumentCategorizerME;
 import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ParagraphStream;
 import opennlp.tools.util.PlainTextByLineStream;
@@ -55,15 +54,11 @@ public class DoccatTool extends BasicCmd
 
       DocumentCategorizerME doccat = new DocumentCategorizerME(model);
 
-      //ObjectStream<String> documentStream = new ParagraphStream(
-            //  new PlainTextByLineStream(new InputStreamReader(System.in)));
       /**
        * moved initialization to the try block to catch new IOException
        */
       ObjectStream<String> documentStream;
 
-
-
       PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
       perfMon.start();
 
@@ -72,10 +67,12 @@ public class DoccatTool extends BasicCmd
                 new PlainTextByLineStream(new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
         String document;
         while ((document = documentStream.read()) != null) {
-          double prob[] = doccat.categorize(WhitespaceTokenizer.INSTANCE.tokenize(document));
+          String[] tokens = model.getFactory().getTokenizer().tokenize(document);
+
+          double prob[] = doccat.categorize(tokens);
           String category = doccat.getBestCategory(prob);
 
-          DocumentSample sample = new DocumentSample(category, document);
+          DocumentSample sample = new DocumentSample(category, tokens);
           System.out.println(sample.toString());
 
           perfMon.incrementCounter();