You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2014/03/12 15:51:51 UTC
svn commit: r1576767 - in /opennlp/trunk/opennlp-tools: lang/es/
lang/es/parser/ lang/es/parser/es-head-rules
src/main/java/opennlp/tools/parser/lang/es/
src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
Author: joern
Date: Wed Mar 12 14:51:51 2014
New Revision: 1576767
URL: http://svn.apache.org/r1576767
Log:
OPENNLP-665 Added Spanish head rules file and implementation. Thanks to Rodrigo Agerri for providing a patch.
Added:
opennlp/trunk/opennlp-tools/lang/es/
opennlp/trunk/opennlp-tools/lang/es/parser/
opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
Added: opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules?rev=1576767&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules (added)
+++ opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules Wed Mar 12 14:51:51 2014
@@ -0,0 +1,22 @@
+13 SENTENCE 0 PREP SP[CS].* CS.* GRUP\\.VERB S SA COORD CONJ GRUP\\.NOM SN S
+12 S 0 PREP SP[CS].* COORD CONJ CS.* GRUP\\.VERB S SA GRUP\\.NOM SN
+22 SA 0 NC.*P.* GRUP\\.NOM \\$ NC.*S.* SADV GRUP\\.ADV AQA.* AQC.* V[MAS]P.* V[MAS]G.* SA S\\.A GRUP\\.A AQS.* SN GRUP\\.NOM D.* S RG RN
+21 S.A 0 NC.*P.* GRUP\\.NOM \\$ NC.*S.* SADV GRUP\\.ADV AQA.* AQC.* V[MAS]P.* V[MAS]G.* S\\.A GRUP\\.A AQS.* SN GRUP\\.NOM D.* S RG RN
+20 SADV 1 S RG RN SADV GRUP\\.ADV SP[CS].* PREP Z.* AQA.* AQC.* S\\.A GRUP\\.A CONJ CS.* SN GRUP\\.NOM AQS.* NC.*S.*
+8 SP 0 SP[CS].* PREP CS.* CONJ V[MAS]G.* V[MAS]P.*
+20 GRUP.A 1 NC.*P.* GRUP\\.NOM \\$ NC.*S.* SADV GRUP\\.ADV AQA.* AQC.* V[MAS]P.* V[MAS]G.* GRUP\\.A AQS.* SN GRUP\\.NOM D.* S RG RN
+18 GRUP.ADV 0 RG RN GRUP\\.ADV PREP SP.* Z.* AQA.* AQC.* GRUP\\.A S\\.A CS.* CONJ SN GRUP\\.NOM AQS.* NC.*S.*
+23 GRUP.VERB 0 INFINITIU GERUNDI PARTICIPI PREP SP[CS].* V[MAS].*[IS].* V[MAS]P.* V.*C.* V[MAS]IP3S.* V.* V[MAS]G.* V[MAS]IP[12]S.* GRUP\\.VERB SA S\\.A GRUP\\.A NC.*S.* NC.*P.* GRUP\\.NOM SN S
+5 INFINITIU 0 VMN.* V[MAS]N.* V.*
+5 GERUNDI 0 VMG.* V[MAS]G.* V.*
+5 PARTICIPI 0 VMP.* V[MAS]P.* V.*
+6 MORFEMA.PRONOMINAL 0 P.* SN.* GRUP\\.NOM.* GRUP\\.VERB
+7 MORFEMA.VERBAL 0 GRUP\\.VERB P.* SN.* GRUP\\.NOM.* S
+9 COORD 1 CONJ CC.* RB RN SP[CS].* PREP CS
+16 INC 0 S SN GRUP\\.NOM GRUP\\.VERB SADV GRUP.ADV SA S\\.A GRUP\\.A PREP SP[CS].* CONJ CS D.*
+3 INTERJECCIO 0 I
+3 NEG 0 RN
+6 PREP 0 PREP SP[CS].* CONJ CS
+7 RELATIU 0 P.* SN GRUP\\.NOM S GRUP\\.VERB
+2 SPEC 0
+2 X 1
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java?rev=1576767&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java Wed Mar 12 14:51:51 2014
@@ -0,0 +1,279 @@
+/*
+ *
+ *Copyright 2013 Rodrigo Agerri
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+
+package opennlp.tools.parser.lang.es;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.Writer;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+import java.util.StringTokenizer;
+
+import opennlp.tools.parser.Constituent;
+import opennlp.tools.parser.GapLabeler;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.chunking.Parser;
+
+/**
+ * Class for storing the Ancora Spanish head rules associated with parsing. The head rules
+ * are specified in the es-head-rules file (lang/es/parser/es-head-rules in opennlp-tools).
+ *
+ * NOTE: This class has been adapted from opennlp.tools.parser.lang.en.HeadRules
+ *
+ * The main change is the constituents search direction in the first for loop.
+ *
+ * Note also the change in the return of the getHead() method: In Apache OpenNLP
+ * lang.en.HeadRules class: return constituents[ci].getHead(); Now: return constituents[ci];
+ *
+ * Other changes include removal of deprecated methods we do not need to use.
+ *
+ */
+public class AncoraSpanishHeadRules implements opennlp.tools.parser.HeadRules, GapLabeler {
+
+ /**
+  * A single head rule: the tag patterns of the rule together with the
+  * direction in which constituents are scanned when the rule is applied.
+  */
+ private static class HeadRule {
+   public boolean leftToRight;
+   public String[] tags;
+
+   /**
+    * @param l2r {@code true} if constituents are scanned left to right.
+    * @param tags the tag patterns of the rule; earlier entries are tried first.
+    *
+    * @throws IllegalArgumentException if {@code tags} contains a null element.
+    */
+   public HeadRule(boolean l2r, String[] tags) {
+     leftToRight = l2r;
+
+     for (String tag : tags) {
+       if (tag == null) {
+         throw new IllegalArgumentException("tags must not contain null values!");
+       }
+     }
+
+     this.tags = tags;
+   }
+
+   @Override
+   public boolean equals(Object obj) {
+     if (obj == this) {
+       return true;
+     }
+     else if (obj instanceof HeadRule) {
+       HeadRule rule = (HeadRule) obj;
+
+       return (rule.leftToRight == leftToRight) &&
+           Arrays.equals(rule.tags, tags);
+     }
+     else {
+       return false;
+     }
+   }
+
+   @Override
+   public int hashCode() {
+     // Must agree with equals(): same direction and same tag sequence
+     // yield the same hash code.
+     return 31 * (leftToRight ? 1 : 0) + Arrays.hashCode(tags);
+   }
+ }
+
+ private Map<String, HeadRule> headRules;
+ private Set<String> punctSet;
+
+
+
+ /**
+  * Builds the head rules by parsing the given reader and registers the
+  * punctuation tags of this implementation.
+  *
+  * @param rulesReader the reader supplying the head rules.
+  *
+  * @throws IOException if reading from the head rules reader fails.
+  */
+ public AncoraSpanishHeadRules(Reader rulesReader) throws IOException {
+   readHeadRules(new BufferedReader(rulesReader));
+
+   // Note: ":" is not registered as a punctuation tag.
+   punctSet = new HashSet<String>(Arrays.asList(".", ",", "``", "''"));
+ }
+
+ /**
+  * Returns the punctuation tags registered by the constructor.
+  *
+  * @return the set of punctuation tags held by this instance (not a copy).
+  */
+ public Set<String> getPunctuationTags() {
+   return this.punctSet;
+ }
+
+ /**
+  * Selects the head among the given constituents of a node of the given type.
+  * <p>
+  * Nodes of type {@code SN} or {@code GRUP.NOM} are handled by a fixed cascade
+  * of searches; every other type uses the head rule loaded for that type, if
+  * there is one. Whenever a pattern matches, the matching constituent itself
+  * is returned. If nothing matches, the head of the first constituent is
+  * returned for a left-to-right rule and the head of the last one otherwise.
+  *
+  * @param constituents the constituents to choose the head from; the first
+  *     element is always inspected, so the array must not be empty.
+  * @param type the type of the node whose head is requested.
+  *
+  * @return the selected head, or {@code null} if the first constituent is a
+  *     token node.
+  */
+ public Parse getHead(Parse[] constituents, String type) {
+   // Compare by value: "==" on strings is only true for the very same instance.
+   if (Parser.TOK_NODE.equals(constituents[0].getType())) {
+     return null;
+   }
+   HeadRule hr;
+   if (type.equals("SN") || type.equals("GRUP.NOM")) {
+     // Step 1: leftmost constituent matching any of these patterns.
+     String[] tags1 = {"AQA.*","AQC.*","GRUP\\.A","S\\.A","NC.*S.*", "NP.*","NC.*P.*", "GRUP\\.NOM"};
+     for (int ci = 0; ci < constituents.length; ci++) {
+       if (matchesAny(constituents[ci].getType(), tags1)) {
+         return constituents[ci];
+       }
+     }
+     // Step 2: leftmost constituent that is itself an SN or GRUP.NOM.
+     for (int ci = 0; ci < constituents.length; ci++) {
+       if (constituents[ci].getType().equals("SN") || constituents[ci].getType().equals("GRUP.NOM")) {
+         return constituents[ci];
+       }
+     }
+     // Step 3: rightmost constituent matching any of these patterns.
+     String[] tags2 = {"\\$","GRUP\\.A","SA"};
+     for (int ci = constituents.length - 1; ci >= 0; ci--) {
+       if (matchesAny(constituents[ci].getType(), tags2)) {
+         return constituents[ci];
+       }
+     }
+     // Step 4: rightmost constituent matching any of these patterns.
+     String[] tags3 = {"AQ0.*", "AQ[AC].*","AO.*","GRUP\\.A","S\\.A","RG","RN","GRUP\\.NOM"};
+     for (int ci = constituents.length - 1; ci >= 0; ci--) {
+       if (matchesAny(constituents[ci].getType(), tags3)) {
+         return constituents[ci];
+       }
+     }
+     // Nothing matched: fall back to the head of the last constituent.
+     return constituents[constituents.length - 1].getHead();
+   }
+   else if ((hr = headRules.get(type)) != null) {
+     String[] tags = hr.tags;
+     int cl = constituents.length;
+     int tl = tags.length;
+     // Tags are tried in rule order; for each tag the constituents are
+     // scanned in the direction given by the rule.
+     if (hr.leftToRight) {
+       for (int ti = 0; ti < tl; ti++) {
+         for (int ci = 0; ci < cl; ci++) {
+           if (constituents[ci].getType().matches(tags[ti])) {
+             return constituents[ci];
+           }
+         }
+       }
+       return constituents[0].getHead();
+     }
+     else {
+       for (int ti = 0; ti < tl; ti++) {
+         for (int ci = cl - 1; ci >= 0; ci--) {
+           if (constituents[ci].getType().matches(tags[ti])) {
+             return constituents[ci];
+           }
+         }
+       }
+       return constituents[cl - 1].getHead();
+     }
+   }
+   // No rule is known for this type: use the head of the last constituent.
+   return constituents[constituents.length - 1].getHead();
+ }
+
+ /**
+  * Tells whether the given constituent type matches at least one of the
+  * given regular expressions.
+  */
+ private static boolean matchesAny(String constituentType, String[] patterns) {
+   for (String pattern : patterns) {
+     if (constituentType.matches(pattern)) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ /**
+  * Reads the head rules from the given reader and stores them by type.
+  * <p>
+  * Each non-blank line has the form {@code <num> <type> <dir> <tag>...}:
+  * {@code num - 2} is the number of tags that follow, and a {@code dir} of
+  * {@code 1} selects left-to-right scanning (any other value selects
+  * right-to-left). Blank or whitespace-only lines are skipped.
+  * <p>
+  * The tags are stored verbatim and later used as regular expressions by
+  * {@code getHead}. NOTE(review): backslashes in the rules file are therefore
+  * not Java-string escapes; verify that the file does not double them.
+  *
+  * @param str the reader to read the head rules from.
+  *
+  * @throws IOException if reading from the reader fails.
+  */
+ private void readHeadRules(BufferedReader str) throws IOException {
+   String line;
+   headRules = new HashMap<String, HeadRule>(60);
+   while ((line = str.readLine()) != null) {
+     StringTokenizer st = new StringTokenizer(line);
+     if (!st.hasMoreTokens()) {
+       // Blank line (for example a trailing newline at the end of the
+       // file): there is no rule to parse here.
+       continue;
+     }
+     String num = st.nextToken();
+     String type = st.nextToken();
+     String dir = st.nextToken();
+     // The leading count also covers the type and direction columns.
+     String[] tags = new String[Integer.parseInt(num) - 2];
+     int ti = 0;
+     while (st.hasMoreTokens()) {
+       tags[ti] = st.nextToken();
+       ti++;
+     }
+     headRules.put(type, new HeadRule(dir.equals("1"), tags));
+   }
+ }
+
+ /**
+  * Appends "-G" to the labels of the constituents below the top of the stack
+  * when they form one of the two recognised label sequences (subject
+  * extraction or object extraction). Stacks with fewer than five elements
+  * are left untouched.
+  *
+  * @param stack the stack of constituents to inspect and relabel.
+  */
+ public void labelGaps(Stack<Constituent> stack) {
+   int size = stack.size();
+   if (size <= 4) {
+     return;
+   }
+
+   // The top element (size - 1) is not inspected.
+   Constituent con1 = stack.get(size - 2);
+   Constituent con2 = stack.get(size - 3);
+   Constituent con3 = stack.get(size - 4);
+   Constituent con4 = stack.get(size - 5);
+
+   Constituent[] gapped;
+   if (con1.getLabel().equals("SN")
+       && con2.getLabel().equals("S")
+       && con3.getLabel().equals("GRUP.NOM")) {
+     // subject extraction
+     gapped = new Constituent[] {con1, con2, con3};
+   }
+   else if (con1.getLabel().equals("SN")
+       && con2.getLabel().equals("GRUP.VERB")
+       && con3.getLabel().equals("S")
+       && con4.getLabel().equals("GRUP.NOM")) {
+     // object extraction
+     gapped = new Constituent[] {con1, con2, con3, con4};
+   }
+   else {
+     return;
+   }
+
+   for (Constituent con : gapped) {
+     con.setLabel(con.getLabel() + "-G");
+   }
+ }
+
+ /**
+ * Writes the head rules to the writer in a format suitable for loading
+ * the head rules again with the constructor. The encoding must be
+ * taken into account while working with the writer and reader.
+ * <p>
+ * After the entries have been written, the writer is flushed.
+ * The writer remains open after this method returns.
+ *
+ * @param writer
+ * @throws IOException
+ */
+ public void serialize(Writer writer) throws IOException {
+
+ for (String type : headRules.keySet()) {
+
+ HeadRule headRule = headRules.get(type);
+
+ // write num of tags
+ writer.write(Integer.toString(headRule.tags.length + 2));
+ writer.write(' ');
+
+ // write type
+ writer.write(type);
+ writer.write(' ');
+
+ // write l2r true == 1
+ if (headRule.leftToRight)
+ writer.write("1");
+ else
+ writer.write("0");
+
+ // write tags
+ for (String tag : headRule.tags) {
+ writer.write(' ');
+ writer.write(tag);
+ }
+
+ writer.write('\n');
+ }
+
+ writer.flush();
+ }
+
+ /**
+  * Two instances are equal when they hold equal head rules and equal
+  * punctuation tags.
+  */
+ @Override
+ public boolean equals(Object obj) {
+   if (obj == this) {
+     return true;
+   }
+   else if (obj instanceof AncoraSpanishHeadRules) {
+     AncoraSpanishHeadRules rules = (AncoraSpanishHeadRules) obj;
+
+     return rules.headRules.equals(headRules) &&
+         rules.punctSet.equals(punctSet);
+   }
+   else {
+     return false;
+   }
+ }
+
+ @Override
+ public int hashCode() {
+   // Only the rule types (the map keys) and the punctuation tags are hashed.
+   // Equal instances always have equal key sets, so this stays consistent
+   // with equals() without depending on the hash of the rule values.
+   return 31 * headRules.keySet().hashCode() + punctSet.hashCode();
+ }
+}