You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2014/04/16 18:14:10 UTC
svn commit: r1587956 - /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
Author: colen
Date: Wed Apr 16 16:14:09 2014
New Revision: 1587956
URL: http://svn.apache.org/r1587956
Log:
OPENNLP-674 Use tokenizer from the factory
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java?rev=1587956&r1=1587955&r2=1587956&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java Wed Apr 16 16:14:09 2014
@@ -27,7 +27,6 @@ import opennlp.tools.cmdline.SystemInput
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ParagraphStream;
import opennlp.tools.util.PlainTextByLineStream;
@@ -55,15 +54,11 @@ public class DoccatTool extends BasicCmd
DocumentCategorizerME doccat = new DocumentCategorizerME(model);
- //ObjectStream<String> documentStream = new ParagraphStream(
- // new PlainTextByLineStream(new InputStreamReader(System.in)));
/**
* moved initialization to the try block to catch new IOException
*/
ObjectStream<String> documentStream;
-
-
PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
perfMon.start();
@@ -72,10 +67,12 @@ public class DoccatTool extends BasicCmd
new PlainTextByLineStream(new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
String document;
while ((document = documentStream.read()) != null) {
- double prob[] = doccat.categorize(WhitespaceTokenizer.INSTANCE.tokenize(document));
+ String[] tokens = model.getFactory().getTokenizer().tokenize(document);
+
+ double prob[] = doccat.categorize(tokens);
String category = doccat.getBestCategory(prob);
- DocumentSample sample = new DocumentSample(category, document);
+ DocumentSample sample = new DocumentSample(category, tokens);
System.out.println(sample.toString());
perfMon.incrementCounter();