You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2016/11/02 18:26:34 UTC
opennlp git commit: Replace StringTokenizer with OpenNLP Tokenizer
Repository: opennlp
Updated Branches:
refs/heads/trunk 92bc7f05a -> f1cbfeab3
Replace StringTokenizer with OpenNLP Tokenizer
The StringTokenizer was used to perform white space tokenization
long before the WhitespaceTokenizer became a part of OpenNLP.
This change also allows passing in a tokenizer, which makes it
easier to tokenize an input sentence without using pipes.
See issue OPENNLP-857 for more details.
Thanks to Tristan Nixon for providing a patch!
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f1cbfeab
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f1cbfeab
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f1cbfeab
Branch: refs/heads/trunk
Commit: f1cbfeab32df7fa41945204568e01fb2d4c4a4b8
Parents: 92bc7f0
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed Nov 2 19:22:24 2016 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed Nov 2 19:22:24 2016 +0100
----------------------------------------------------------------------
.../tools/cmdline/parser/ParserTool.java | 43 +++++++++++++-------
1 file changed, 29 insertions(+), 14 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/f1cbfeab/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
index 0bd9ffd..dddaf94 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
@@ -18,10 +18,9 @@ package opennlp.tools.cmdline.parser;
import java.io.File;
import java.io.IOException;
-import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
-import java.util.StringTokenizer;
import java.util.regex.Pattern;
import opennlp.tools.cmdline.BasicCmdLineTool;
@@ -29,10 +28,16 @@ import opennlp.tools.cmdline.CLI;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.PerformanceMonitor;
import opennlp.tools.cmdline.SystemInputStreamFactory;
+import opennlp.tools.cmdline.tokenizer.TokenizerModelLoader;
import opennlp.tools.parser.AbstractBottomUpParser;
import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
@@ -44,26 +49,31 @@ public final class ParserTool extends BasicCmdLineTool {
}
public String getHelp() {
- return "Usage: " + CLI.CMD + " " + getName() + " [-bs n -ap n -k n] model < sentences \n"
+ return "Usage: " + CLI.CMD + " " + getName() + " [-bs n -ap n -k n -tk tok_model] model < sentences \n"
+ "-bs n: Use a beam size of n.\n"
+ "-ap f: Advance outcomes in with at least f% of the probability mass.\n"
- + "-k n: Show the top n parses. This will also display their log-probablities.";
+ + "-k n: Show the top n parses. This will also display their log-probablities.\n"
+ + "-tk tok_model: Use the specified tokenizer model to tokenize the sentences. Defaults to a WhitespaceTokenizer.";
}
private static Pattern untokenizedParenPattern1 = Pattern.compile("([^ ])([({)}])");
private static Pattern untokenizedParenPattern2 = Pattern.compile("([({)}])([^ ])");
- public static Parse[] parseLine(String line, opennlp.tools.parser.Parser parser, int numParses) {
+ public static Parse[] parseLine(String line, Parser parser, int numParses) {
+ return parseLine( line, parser, WhitespaceTokenizer.INSTANCE, numParses );
+ }
+
+ public static Parse[] parseLine(String line, Parser parser, Tokenizer tokenizer, int numParses) {
+ // fix some parens patterns
line = untokenizedParenPattern1.matcher(line).replaceAll("$1 $2");
line = untokenizedParenPattern2.matcher(line).replaceAll("$1 $2");
- StringTokenizer str = new StringTokenizer(line);
+
+ // tokenize
+ List<String> tokens = Arrays.asList( tokenizer.tokenize(line));
StringBuilder sb = new StringBuilder();
- List<String> tokens = new ArrayList<String>();
- while (str.hasMoreTokens()) {
- String tok = str.nextToken();
- tokens.add(tok);
+ for (String tok : tokens) {
sb.append(tok).append(" ");
}
- String text = sb.substring(0, sb.length() - 1);
+ String text = sb.substring(0, sb.length());
Parse p = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0);
int start = 0;
int i = 0;
@@ -109,9 +119,14 @@ public final class ParserTool extends BasicCmdLineTool {
advancePercentage = AbstractBottomUpParser.defaultAdvancePercentage;
}
- opennlp.tools.parser.Parser parser =
- ParserFactory.create(model, beamSize, advancePercentage);
+ Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+ String tokenizerModelName = CmdLineUtil.getParameter( "-tk", args );
+ if( tokenizerModelName != null ){
+ TokenizerModel tokenizerModel = new TokenizerModelLoader().load( new File( tokenizerModelName ) );
+ tokenizer = new TokenizerME( tokenizerModel );
+ }
+ Parser parser = ParserFactory.create(model, beamSize, advancePercentage);
ObjectStream<String> lineStream = null;
PerformanceMonitor perfMon = null;
@@ -124,7 +139,7 @@ public final class ParserTool extends BasicCmdLineTool {
if (line.trim().length() == 0) {
System.out.println();
} else {
- Parse[] parses = parseLine(line, parser, numParses);
+ Parse[] parses = parseLine(line, parser, tokenizer, numParses);
for (int pi = 0, pn = parses.length; pi < pn; pi++) {
if (showTopK) {