You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2014/03/12 15:51:51 UTC
svn commit: r1576767 - in /opennlp/trunk/opennlp-tools: lang/es/ lang/es/parser/ lang/es/parser/es-head-rules src/main/java/opennlp/tools/parser/lang/es/ src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java

Author: joern
Date: Wed Mar 12 14:51:51 2014
New Revision: 1576767

URL: http://svn.apache.org/r1576767
Log:
OPENNLP-665 Added Spanish head rules file and implementation. Thanks to Rodrigo Agerri for providing a patch.

Added:
    opennlp/trunk/opennlp-tools/lang/es/
    opennlp/trunk/opennlp-tools/lang/es/parser/
    opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java

Added: opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules?rev=1576767&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules (added)
+++ opennlp/trunk/opennlp-tools/lang/es/parser/es-head-rules Wed Mar 12 14:51:51 2014
@@ -0,0 +1,22 @@
+13 SENTENCE 0 PREP SP[CS].* CS.* GRUP\\.VERB S SA COORD CONJ GRUP\\.NOM SN S
+12 S 0 PREP SP[CS].* COORD CONJ CS.* GRUP\\.VERB S SA GRUP\\.NOM SN
+22 SA 0 NC.*P.* GRUP\\.NOM \\$ NC.*S.* SADV GRUP\\.ADV AQA.* AQC.* V[MAS]P.* V[MAS]G.* SA S\\.A GRUP\\.A AQS.* SN GRUP\\.NOM D.* S RG RN
+21 S.A 0 NC.*P.* GRUP\\.NOM \\$ NC.*S.* SADV GRUP\\.ADV AQA.* AQC.* V[MAS]P.* V[MAS]G.* S\\.A GRUP\\.A AQS.* SN GRUP\\.NOM D.* S RG RN
+20 SADV 1 S RG RN SADV GRUP\\.ADV SP[CS].* PREP Z.* AQA.* AQC.* S\\.A GRUP\\.A CONJ CS.* SN GRUP\\.NOM AQS.* NC.*S.*
+8 SP 0 SP[CS].* PREP CS.* CONJ V[MAS]G.* V[MAS]P.*
+20 GRUP.A 1 NC.*P.* GRUP\\.NOM \\$ NC.*S.* SADV GRUP\\.ADV AQA.* AQC.* V[MAS]P.* V[MAS]G.* GRUP\\.A AQS.* SN GRUP\\.NOM D.* S RG RN
+18 GRUP.ADV 0 RG RN GRUP\\.ADV PREP SP.* Z.* AQA.* AQC.* GRUP\\.A S\\.A CS.* CONJ SN GRUP\\.NOM AQS.* NC.*S.*
+23 GRUP.VERB 0 INFINITIU GERUNDI PARTICIPI PREP SP[CS].* V[MAS].*[IS].* V[MAS]P.* V.*C.* V[MAS]IP3S.* V.* V[MAS]G.* V[MAS]IP[12]S.* GRUP\\.VERB SA S\\.A GRUP\\.A NC.*S.* NC.*P.* GRUP\\.NOM SN S
+5 INFINITIU 0 VMN.* V[MAS]N.* V.*
+5 GERUNDI 0 VMG.* V[MAS]G.* V.*
+5 PARTICIPI 0 VMP.* V[MAS]P.* V.*
+6 MORFEMA.PRONOMINAL 0 P.* SN.* GRUP\\.NOM.* GRUP\\.VERB
+7 MORFEMA.VERBAL 0 GRUP\\.VERB P.* SN.* GRUP\\.NOM.* S
+9 COORD 1 CONJ CC.* RB RN SP[CS].* PREP CS
+16 INC 0 S SN GRUP\\.NOM GRUP\\.VERB SADV GRUP.ADV SA S\\.A GRUP\\.A PREP SP[CS].* CONJ CS D.*
+3 INTERJECCIO 0 I
+3 NEG 0 RN
+6 PREP 0 PREP SP[CS].* CONJ CS
+7 RELATIU 0 P.* SN GRUP\\.NOM S GRUP\\.VERB
+2 SPEC 0
+2 X 1

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java?rev=1576767&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/lang/es/AncoraSpanishHeadRules.java Wed Mar 12 14:51:51 2014
@@ -0,0 +1,279 @@
+/*
+ *
+ *Copyright 2013 Rodrigo Agerri
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ */
+
+
+package opennlp.tools.parser.lang.es;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.Writer;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+import java.util.StringTokenizer;
+
+import opennlp.tools.parser.Constituent;
+import opennlp.tools.parser.GapLabeler;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.chunking.Parser;
+
+/**
+ * Head rules for Spanish constituents labelled with the Ancora tag set.
+ * <p>
+ * The rules are loaded from the {@link Reader} passed to the constructor, one rule
+ * per line in the form {@code <count> <type> <direction> <tag>...}: {@code count}
+ * is the number of tags plus two, {@code direction} is {@code 1} for a
+ * left-to-right search (any other value means right-to-left) and every tag is a
+ * regular expression that is matched against the constituent types with
+ * {@link String#matches(String)}.
+ * <p>
+ * NOTE: This class has been adapted from opennlp.tools.parser.lang.en.HeadRules.
+ * The main changes are the search direction over the constituents in the first
+ * loop of the nominal (SN / GRUP.NOM) rules and the value returned by
+ * {@link #getHead(Parse[], String)}: a matching constituent is returned itself
+ * ({@code constituents[ci]}) instead of its head
+ * ({@code constituents[ci].getHead()}). Deprecated methods that are not needed
+ * have been removed.
+ */
+public class AncoraSpanishHeadRules implements opennlp.tools.parser.HeadRules, GapLabeler {
+
+  /**
+   * A single head rule: the search direction plus the candidate tags
+   * (regular expressions) in priority order.
+   */
+  private static class HeadRule {
+    public final boolean leftToRight;
+    public final String[] tags;
+
+    /**
+     * @param l2r true to scan the constituents from left to right
+     * @param tags the tag patterns of the rule
+     *
+     * @throws IllegalArgumentException if {@code tags} contains a null element
+     */
+    public HeadRule(boolean l2r, String[] tags) {
+      leftToRight = l2r;
+
+      for (String tag : tags) {
+        if (tag == null) {
+          throw new IllegalArgumentException("tags must not contain null values!");
+        }
+      }
+
+      this.tags = tags;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (obj == this) {
+        return true;
+      }
+      if (obj instanceof HeadRule) {
+        HeadRule rule = (HeadRule) obj;
+        return rule.leftToRight == leftToRight && Arrays.equals(rule.tags, tags);
+      }
+      return false;
+    }
+
+    // Kept consistent with equals(): same direction and same tags give the same hash.
+    @Override
+    public int hashCode() {
+      return 31 * Arrays.hashCode(tags) + (leftToRight ? 1 : 0);
+    }
+  }
+
+  /** Suffix appended to the label of constituents that take part in a gap. */
+  private static final String GAP_SUFFIX = "-G";
+
+  /** Nominal rule, first pass: checked against the constituents from left to right. */
+  private static final String[] NOMINAL_TAGS_FIRST_PASS =
+      {"AQA.*","AQC.*","GRUP\\.A","S\\.A","NC.*S.*", "NP.*","NC.*P.*", "GRUP\\.NOM"};
+
+  /** Nominal rule, third pass: checked against the constituents from right to left. */
+  private static final String[] NOMINAL_TAGS_THIRD_PASS = {"\\$","GRUP\\.A","SA"};
+
+  /** Nominal rule, fourth pass: checked against the constituents from right to left. */
+  private static final String[] NOMINAL_TAGS_FOURTH_PASS =
+      {"AQ0.*", "AQ[AC].*","AO.*","GRUP\\.A","S\\.A","RG","RN","GRUP\\.NOM"};
+
+  private final Map<String, HeadRule> headRules;
+  private final Set<String> punctSet;
+
+  /**
+   * Creates a new set of head rules based on the specified reader.
+   *
+   * @param rulesReader the head rules reader.
+   *
+   * @throws IOException if the head rules reader can not be read or a rule line
+   *     is malformed.
+   */
+  public AncoraSpanishHeadRules(Reader rulesReader) throws IOException {
+    headRules = readHeadRules(new BufferedReader(rulesReader));
+
+    punctSet = new HashSet<String>();
+    punctSet.add(".");
+    punctSet.add(",");
+    punctSet.add("``");
+    punctSet.add("''");
+    // NOTE: ":" is currently not registered as a punctuation tag.
+  }
+
+  /**
+   * @return the punctuation tags registered by the constructor.
+   */
+  public Set<String> getPunctuationTags() {
+    return punctSet;
+  }
+
+  /**
+   * Selects the head among the specified constituents of a node of the given type.
+   * <p>
+   * SN and GRUP.NOM nodes are handled by built-in rules; every other type is
+   * looked up in the rules read by the constructor. A constituent that matches a
+   * rule is returned itself. If nothing matches, the head of the first
+   * constituent (left-to-right rules) or of the last constituent (right-to-left
+   * rules and types without a rule) is returned.
+   *
+   * @param constituents the children of the node; must contain at least one element
+   * @param type the type of the node
+   *
+   * @return the selected head, or null if the first constituent is a token node
+   */
+  public Parse getHead(Parse[] constituents, String type) {
+    // Compare by value: a reference comparison only works for the very same String instance.
+    if (Parser.TOK_NODE.equals(constituents[0].getType())) {
+      return null;
+    }
+
+    if (type.equals("SN") || type.equals("GRUP.NOM")) {
+      return getNominalHead(constituents);
+    }
+
+    HeadRule rule = headRules.get(type);
+    if (rule != null) {
+      Parse head = findByTagPriority(constituents, rule.tags, rule.leftToRight);
+      if (head != null) {
+        return head;
+      }
+      int fallbackIndex = rule.leftToRight ? 0 : constituents.length - 1;
+      return constituents[fallbackIndex].getHead();
+    }
+
+    return constituents[constituents.length - 1].getHead();
+  }
+
+  /**
+   * Applies the built-in rules for SN and GRUP.NOM nodes: four passes over the
+   * constituents, then the head of the last constituent as a fallback.
+   */
+  private static Parse getNominalHead(Parse[] constituents) {
+    Parse head = findByConstituentOrder(constituents, NOMINAL_TAGS_FIRST_PASS, true);
+    if (head != null) {
+      return head;
+    }
+
+    // Second pass: the leftmost nested SN or GRUP.NOM.
+    for (Parse constituent : constituents) {
+      String constituentType = constituent.getType();
+      if (constituentType.equals("SN") || constituentType.equals("GRUP.NOM")) {
+        return constituent;
+      }
+    }
+
+    head = findByConstituentOrder(constituents, NOMINAL_TAGS_THIRD_PASS, false);
+    if (head != null) {
+      return head;
+    }
+
+    head = findByConstituentOrder(constituents, NOMINAL_TAGS_FOURTH_PASS, false);
+    if (head != null) {
+      return head;
+    }
+
+    return constituents[constituents.length - 1].getHead();
+  }
+
+  /**
+   * Returns the first constituent, in the given direction, whose type matches
+   * any of the tags, or null if there is none. The position of the constituent
+   * decides; the order of the tags does not matter.
+   */
+  private static Parse findByConstituentOrder(Parse[] constituents, String[] tags,
+      boolean leftToRight) {
+    int count = constituents.length;
+    for (int step = 0; step < count; step++) {
+      Parse candidate = constituents[leftToRight ? step : count - 1 - step];
+      for (String tag : tags) {
+        if (candidate.getType().matches(tag)) {
+          return candidate;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Tries the tags in their given order and, for each tag, scans the
+   * constituents in the given direction. Returns the first match or null.
+   * Earlier tags win over the position of the constituent.
+   */
+  private static Parse findByTagPriority(Parse[] constituents, String[] tags,
+      boolean leftToRight) {
+    int count = constituents.length;
+    for (String tag : tags) {
+      for (int step = 0; step < count; step++) {
+        Parse candidate = constituents[leftToRight ? step : count - 1 - step];
+        if (candidate.getType().matches(tag)) {
+          return candidate;
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Reads the head rules, one per line, in the format
+   * {@code <count> <type> <direction> <tag>...}. Blank lines are skipped.
+   *
+   * @param in the reader to read the rules from
+   *
+   * @return the rules keyed by constituent type; a later line for the same type
+   *     replaces an earlier one
+   *
+   * @throws IOException if reading fails, a line has fewer than three fields or
+   *     the declared count does not equal the number of tags plus two
+   * @throws NumberFormatException if the count field is not an integer
+   */
+  private static Map<String, HeadRule> readHeadRules(BufferedReader in) throws IOException {
+    Map<String, HeadRule> rules = new HashMap<String, HeadRule>(60);
+
+    String line;
+    while ((line = in.readLine()) != null) {
+      StringTokenizer st = new StringTokenizer(line);
+      if (!st.hasMoreTokens()) {
+        // Blank line, e.g. a trailing newline at the end of a hand-edited file.
+        continue;
+      }
+      if (st.countTokens() < 3) {
+        throw new IOException("Invalid head rule, expected "
+            + "'<count> <type> <direction> <tag>...' but got: " + line);
+      }
+
+      String num = st.nextToken();
+      String type = st.nextToken();
+      String dir = st.nextToken();
+
+      // The remaining tokens are the tags; the declared count must agree with them.
+      String[] tags = new String[st.countTokens()];
+      if (Integer.parseInt(num) - 2 != tags.length) {
+        throw new IOException("Invalid head rule, declared count " + num
+            + " does not match " + tags.length + " tag(s) plus two: " + line);
+      }
+      for (int ti = 0; ti < tags.length; ti++) {
+        tags[ti] = st.nextToken();
+      }
+
+      rules.put(type, new HeadRule(dir.equals("1"), tags));
+    }
+
+    return rules;
+  }
+
+  /**
+   * Appends "-G" to the labels of the constituents just below the top of the
+   * stack when they form one of two label sequences. Nothing happens unless the
+   * stack holds more than four constituents. The topmost element is not inspected.
+   *
+   * @param stack the constituents to inspect; matching elements are relabelled in place
+   */
+  public void labelGaps(Stack<Constituent> stack) {
+    if (stack.size() <= 4) {
+      return;
+    }
+
+    int top = stack.size() - 1;
+    Constituent con1 = stack.get(top - 1);
+    Constituent con2 = stack.get(top - 2);
+    Constituent con3 = stack.get(top - 3);
+    Constituent con4 = stack.get(top - 4);
+
+    //subject extraction
+    if (con1.getLabel().equals("SN") && con2.getLabel().equals("S")
+        && con3.getLabel().equals("GRUP.NOM")) {
+      markAsGap(con1, con2, con3);
+    }
+    //object extraction
+    else if (con1.getLabel().equals("SN") && con2.getLabel().equals("GRUP.VERB")
+        && con3.getLabel().equals("S") && con4.getLabel().equals("GRUP.NOM")) {
+      markAsGap(con1, con2, con3, con4);
+    }
+  }
+
+  /** Appends the gap suffix to the label of each given constituent. */
+  private static void markAsGap(Constituent... gapConstituents) {
+    for (Constituent constituent : gapConstituents) {
+      constituent.setLabel(constituent.getLabel() + GAP_SUFFIX);
+    }
+  }
+
+  /**
+   * Writes the head rules to the writer in a format suitable for loading
+   * the head rules again with the constructor. The encoding must be
+   * taken into account while working with the writer and reader.
+   * <p>
+   * After the entries have been written, the writer is flushed.
+   * The writer remains open after this method returns.
+   *
+   * @param writer the writer the rules are written to
+   *
+   * @throws IOException if writing to the writer fails
+   */
+  public void serialize(Writer writer) throws IOException {
+
+    for (Map.Entry<String, HeadRule> entry : headRules.entrySet()) {
+
+      HeadRule headRule = entry.getValue();
+
+      // write num of tags (the count field also covers the type and direction fields)
+      writer.write(Integer.toString(headRule.tags.length + 2));
+      writer.write(' ');
+
+      // write type
+      writer.write(entry.getKey());
+      writer.write(' ');
+
+      // write l2r true == 1
+      writer.write(headRule.leftToRight ? "1" : "0");
+
+      // write tags
+      for (String tag : headRule.tags) {
+        writer.write(' ');
+        writer.write(tag);
+      }
+
+      writer.write('\n');
+    }
+
+    writer.flush();
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+    if (obj instanceof AncoraSpanishHeadRules) {
+      AncoraSpanishHeadRules rules = (AncoraSpanishHeadRules) obj;
+      return rules.headRules.equals(headRules) && rules.punctSet.equals(punctSet);
+    }
+    return false;
+  }
+
+  // Kept consistent with equals(): derived from the same two fields.
+  @Override
+  public int hashCode() {
+    return 31 * headRules.hashCode() + punctSet.hashCode();
+  }
+}