You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/12/01 12:47:01 UTC
svn commit: r1209044 -
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lang/english/TreebankNameFinder.java
Author: joern
Date: Thu Dec 1 11:47:01 2011
New Revision: 1209044
URL: http://svn.apache.org/viewvc?rev=1209044&view=rev
Log:
OPENNLP-407 Restored name finder command line tool which can process parse trees.
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lang/english/TreebankNameFinder.java (with props)
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lang/english/TreebankNameFinder.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lang/english/TreebankNameFinder.java?rev=1209044&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lang/english/TreebankNameFinder.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lang/english/TreebankNameFinder.java Thu Dec 1 11:47:01 2011
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.lang.english;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+import opennlp.model.AbstractModel;
+import opennlp.maxent.io.PooledGISModelReader;
+import opennlp.tools.namefind.NameFinderEventStream;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.util.Span;
+
+/**
+ * Command line name finder for English which can annotate either plain
+ * sentence text or treebank-style parse trees read from standard input.
+ * <p>
+ * Each instance wraps one {@link NameFinderME} built from a single
+ * {@link TokenNameFinderModel}. The {@link #main(String[])} entry point loads
+ * one instance per model file given on the command line and runs all of them
+ * over every input line.
+ */
+public class TreebankNameFinder {
+
+  /** Entity type names; this array is not referenced elsewhere in this class. */
+  public static String[] NAME_TYPES = {"person", "organization", "location", "date", "time", "percentage", "money"};
+
+  /** The underlying maxent name finder this instance delegates to. */
+  private final NameFinderME nameFinder;
+
+  /**
+   * Creates an English name finder using the specified model.
+   *
+   * @param mod The model used for finding names.
+   */
+  public TreebankNameFinder(TokenNameFinderModel mod) {
+    nameFinder = new NameFinderME(mod);
+  }
+
+  /**
+   * Inserts name nodes of type {@code tag} into the parse that owns {@code tokens}.
+   * <p>
+   * For every name span (expressed in token indices) the common parent of the
+   * first and last token is looked up. A new node covering the name is inserted
+   * when the name's character span equals the parent's span, or when it does
+   * not cross any of the parent's children. Otherwise, if the parent is an
+   * {@code NP} whose first child has more than one child and the last of those
+   * grandchildren lies inside the name, the whole parent span is tagged instead.
+   * Names matching none of these cases are skipped.
+   *
+   * @param tag The type assigned to inserted name nodes.
+   * @param names The name spans, in token indices, to add.
+   * @param tokens The tag nodes of the parse, indexed like the name spans.
+   */
+  private static void addNames(String tag, Span[] names, Parse[] tokens) {
+    for (Span nameTokenSpan : names) {
+      Parse startToken = tokens[nameTokenSpan.getStart()];
+      // Span end is exclusive, so the last token of the name is at end - 1.
+      Parse endToken = tokens[nameTokenSpan.getEnd() - 1];
+      Parse commonParent = startToken.getCommonParent(endToken);
+      if (commonParent == null) {
+        continue;
+      }
+
+      // Character span of the name, from the first token's start to the last token's end.
+      Span nameSpan = new Span(startToken.getSpan().getStart(), endToken.getSpan().getEnd());
+      if (nameSpan.equals(commonParent.getSpan())) {
+        commonParent.insert(new Parse(commonParent.getText(), nameSpan, tag, 1.0, endToken.getHeadIndex()));
+        continue;
+      }
+
+      Parse[] kids = commonParent.getChildren();
+      boolean crossingKids = false;
+      for (Parse kid : kids) {
+        if (nameSpan.crosses(kid.getSpan())) {
+          crossingKids = true;
+          break; // one crossing child is enough to rule out a clean insert
+        }
+      }
+
+      if (!crossingKids) {
+        commonParent.insert(new Parse(commonParent.getText(), nameSpan, tag, 1.0, endToken.getHeadIndex()));
+      }
+      else if (commonParent.getType().equals("NP")) {
+        // Fallback: tag the entire NP when the name reaches the end of the
+        // first child's constituents.
+        Parse[] grandKids = kids[0].getChildren();
+        if (grandKids.length > 1 && nameSpan.contains(grandKids[grandKids.length - 1].getSpan())) {
+          commonParent.insert(new Parse(commonParent.getText(), commonParent.getSpan(), tag, 1.0, commonParent.getHeadIndex()));
+        }
+      }
+    }
+  }
+
+  /**
+   * Clears the adaptive data of every given finder.
+   *
+   * @param finders The name finders to reset.
+   */
+  private static void clearPrevTokenMaps(TreebankNameFinder[] finders) {
+    for (TreebankNameFinder finder : finders) {
+      finder.nameFinder.clearAdaptiveData();
+    }
+  }
+
+  /**
+   * Reads one parse per line from {@code input}, adds the names found by each
+   * finder to the parse and displays the result via {@link Parse#show()}.
+   * An empty line is echoed as an empty line on stdout and clears the finders'
+   * adaptive data.
+   *
+   * @param finders The name finders to be used.
+   * @param tags The tag names for the corresponding name finder.
+   * @param input The input reader.
+   * @throws IOException If reading from {@code input} fails.
+   */
+  private static void processParse(TreebankNameFinder[] finders, String[] tags, BufferedReader input) throws IOException {
+    Span[][] nameSpans = new Span[finders.length][];
+
+    for (String line = input.readLine(); null != line; line = input.readLine()) {
+      if (line.equals("")) {
+        System.out.println();
+        clearPrevTokenMaps(finders);
+        continue;
+      }
+      Parse p = Parse.parseParse(line);
+      Parse[] tagNodes = p.getTagNodes();
+      String[] tokens = new String[tagNodes.length];
+      for (int ti = 0; ti < tagNodes.length; ti++) {
+        tokens[ti] = tagNodes[ti].toString();
+      }
+
+      // Run every finder on the untouched token sequence first ...
+      for (int fi = 0; fi < finders.length; fi++) {
+        nameSpans[fi] = finders[fi].nameFinder.find(tokens);
+      }
+      // ... and only then modify the parse with the detected names.
+      for (int fi = 0; fi < finders.length; fi++) {
+        addNames(tags[fi], nameSpans[fi], tagNodes);
+      }
+      p.show();
+    }
+  }
+
+  /**
+   * Adds sgml style name tags to the specified input buffer and outputs this information to stdout.
+   * <p>
+   * Each non-empty line is tokenized with a {@link SimpleTokenizer}; the text
+   * between tokens is copied through unchanged. An empty line is echoed as an
+   * empty line and clears the finders' adaptive data.
+   *
+   * @param finders The name finders to be used.
+   * @param tags The tag names for the corresponding name finder.
+   * @param input The input reader.
+   * @throws IOException If reading from {@code input} fails.
+   */
+  private static void processText(TreebankNameFinder[] finders, String[] tags, BufferedReader input) throws IOException {
+    Span[][] nameSpans = new Span[finders.length][];
+    String[][] nameOutcomes = new String[finders.length][];
+    opennlp.tools.tokenize.Tokenizer tokenizer = new SimpleTokenizer();
+    // Method-local buffer reused across lines; no synchronization needed.
+    StringBuilder output = new StringBuilder();
+    for (String line = input.readLine(); null != line; line = input.readLine()) {
+      if (line.equals("")) {
+        clearPrevTokenMaps(finders);
+        System.out.println();
+        continue;
+      }
+      output.setLength(0);
+      Span[] spans = tokenizer.tokenizePos(line);
+      String[] tokens = Span.spansToStrings(spans, line);
+      for (int fi = 0; fi < finders.length; fi++) {
+        nameSpans[fi] = finders[fi].nameFinder.find(tokens);
+        nameOutcomes[fi] = NameFinderEventStream.generateOutcomes(nameSpans[fi], null, tokens.length);
+      }
+
+      for (int ti = 0; ti < tokens.length; ti++) {
+        // check for end tags: previous token was inside a name, current one is not a continuation
+        if (ti != 0) {
+          for (int fi = 0; fi < finders.length; fi++) {
+            String current = nameOutcomes[fi][ti];
+            String previous = nameOutcomes[fi][ti - 1];
+            if ((current.equals(NameFinderME.START) || current.equals(NameFinderME.OTHER)) &&
+                (previous.equals(NameFinderME.START) || previous.equals(NameFinderME.CONTINUE))) {
+              output.append("</").append(tags[fi]).append(">");
+            }
+          }
+        }
+        // copy the text between the previous token and this one
+        if (ti > 0 && spans[ti - 1].getEnd() < spans[ti].getStart()) {
+          output.append(line.substring(spans[ti - 1].getEnd(), spans[ti].getStart()));
+        }
+        //check for start tags
+        for (int fi = 0; fi < finders.length; fi++) {
+          if (nameOutcomes[fi][ti].equals(NameFinderME.START)) {
+            output.append("<").append(tags[fi]).append(">");
+          }
+        }
+        output.append(tokens[ti]);
+      }
+
+      if (tokens.length != 0) {
+        int last = tokens.length - 1;
+        //final end tags
+        for (int fi = 0; fi < finders.length; fi++) {
+          if (nameOutcomes[fi][last].equals(NameFinderME.START) || nameOutcomes[fi][last].equals(NameFinderME.CONTINUE)) {
+            output.append("</").append(tags[fi]).append(">");
+          }
+        }
+        // copy any text after the last token
+        if (spans[last].getEnd() < line.length()) {
+          output.append(line.substring(spans[last].getEnd()));
+        }
+      }
+      System.out.println(output);
+    }
+  }
+
+  /**
+   * Prints the command line usage to stderr and terminates the JVM with exit status 1.
+   */
+  private static void usage() {
+    System.err.println("Usage NameFinder -[parse] model1 model2 ... modelN < sentences");
+    System.err.println(" -parse: Use this option to find names on parsed input. Un-tokenized sentence text is the default.");
+    System.exit(1);
+  }
+
+  /**
+   * Command line entry point.
+   * <p>
+   * Leading arguments starting with {@code -} are treated as options
+   * ({@code -parse} switches to parsed input, unknown options are reported and
+   * ignored). All remaining arguments are model files; the tag name used for a
+   * model is its file name up to the first {@code '.'}. Input is read from
+   * standard input.
+   *
+   * @param args Options followed by one or more model file names.
+   * @throws IOException If a model cannot be read or reading standard input fails.
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length == 0) {
+      usage();
+    }
+    int ai = 0;
+    boolean parsedInput = false;
+    // Bounds check must come first so that an options-only command line
+    // does not index past the end of args.
+    while (ai < args.length && args[ai].startsWith("-")) {
+      if (args[ai].equals("-parse")) {
+        parsedInput = true;
+      }
+      else {
+        System.err.println("Ignoring unknown option "+args[ai]);
+      }
+      ai++;
+    }
+    if (ai == args.length) {
+      // Only options were given; without a model there is nothing to run.
+      usage();
+    }
+    TreebankNameFinder[] finders = new TreebankNameFinder[args.length-ai];
+    String[] names = new String[args.length-ai];
+    for (int fi=0; ai < args.length; ai++,fi++) {
+      String modelName = args[ai];
+      FileInputStream modelIn = new FileInputStream(modelName);
+      try {
+        finders[fi] = new TreebankNameFinder(new TokenNameFinderModel(modelIn));
+      }
+      finally {
+        // Release the file handle whether or not loading succeeded.
+        modelIn.close();
+      }
+      // Tag name = file name without directory and without everything after the first '.'.
+      int nameStart = modelName.lastIndexOf(File.separator) + 1;
+      int nameEnd = modelName.indexOf('.', nameStart);
+      if (nameEnd == -1) {
+        nameEnd = modelName.length();
+      }
+      names[fi] = modelName.substring(nameStart, nameEnd);
+    }
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    if (parsedInput) {
+      processParse(finders,names,in);
+    }
+    else {
+      processText(finders,names,in);
+    }
+  }
+}
\ No newline at end of file
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/lang/english/TreebankNameFinder.java
------------------------------------------------------------------------------
svn:mime-type = text/plain