You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/03/27 19:01:27 UTC
svn commit: r1305904 - /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java

Author: colen
Date: Tue Mar 27 17:01:27 2012
New Revision: 1305904

URL: http://svn.apache.org/viewvc?rev=1305904&view=rev
Log:
OPENNLP-485: Improved how contractions are handled: some are expanded to more than 2 tokens. Also, we now force tokenization of named entities that contain punctuation.

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java?rev=1305904&r1=1305903&r2=1305904&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java Tue Mar 27 17:01:27 2012
@@ -22,8 +22,10 @@ import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.regex.Matcher;
@@ -67,6 +69,10 @@ public class ADNameSampleStream implemen
    */
   private static final Pattern tagPattern = Pattern.compile("<(NER:)?(.*?)>");
   
+  private static final Pattern whitespacePattern = Pattern.compile("\\s+");
+  private static final Pattern underlinePattern = Pattern.compile("[_]+");
+  private static final Pattern alphanumericPattern = Pattern.compile("^[\\p{L}\\p{Nd}-]+$");
+
   /** 
    * Map to the Arvores Deitadas types to our types. It is read-only.
    */
@@ -254,7 +260,8 @@ public class ADNameSampleStream implemen
         String c = PortugueseContractionUtility.toContraction(leftContractionPart, right);
 
         if (c != null) {
-          sentence.add(c);
+          String[] parts = whitespacePattern.split(c);
+          sentence.addAll(Arrays.asList(parts));
         } else {
           // contraction was missing!
           sentence.add(leftContractionPart);
@@ -276,7 +283,7 @@ public class ADNameSampleStream implemen
 
       if (leafTag != null) {
         if (leafTag.contains("<sam->") && !alreadyAdded) {
-          String[] lexemes = leaf.getLexeme().split("_");
+          String[] lexemes = underlinePattern.split(leaf.getLexeme());
           if(lexemes.length > 1) {
              sentence.addAll(Arrays.asList(lexemes).subList(0, lexemes.length - 1));
           }
@@ -295,7 +302,7 @@ public class ADNameSampleStream implemen
       }
 
       if(!alreadyAdded) {
-        sentence.addAll(Arrays.asList(leaf.getLexeme().split("_")));
+        sentence.addAll(processLexeme(leaf.getLexeme()));
       }
       
       if (namedEntityTag != null) {
@@ -306,7 +313,7 @@ public class ADNameSampleStream implemen
       if (expandLastNER) {
         // if the current leaf has the tag <NER2>, it can be the continuation of
         // a NER.
-        // we check if it is true, and expand the lest NER
+        // we check if it is true, and expand the last NER
         int lastIndex = names.size() - 1;
         Span last = null;
         boolean error = false;
@@ -330,11 +337,42 @@ public class ADNameSampleStream implemen
 
     }
 
-  
-
-
-
+  private List<String> processLexeme(String lexemeStr) {
+    List<String> out = new ArrayList<String>();
+    String[] parts = underlinePattern.split(lexemeStr);
+    for (String tok : parts) {
+      if(tok.length() > 1 && !alphanumericPattern.matcher(tok).matches()) {
+        out.addAll(processTok(tok));
+      } else {
+        out.add(tok);
+      }
+    }
+    return out;
+  }
 
+  private Collection<? extends String> processTok(String tok) {
+    String original = tok;
+    List<String> out = new ArrayList<String>();
+    LinkedList<String> suffix = new LinkedList<String>();
+    char first = tok.charAt(0);
+    if (first == '«') {
+      out.add(Character.toString(first));
+      tok = tok.substring(1);
+    }
+    char last = tok.charAt(tok.length() - 1);
+    if (last == '»' || last == ':' || last == ',' || last == '!' ) {
+      suffix.add(Character.toString(last));
+      tok = tok.substring(0, tok.length() - 1);
+    }
+    
+    if(!original.equals(tok) && tok.length() > 1 && !alphanumericPattern.matcher(tok).matches()) {
+      out.addAll(processTok(tok));
+    } else {
+      out.add(tok);
+    }
+    out.addAll(suffix);
+    return out;
+  }
 
   /**
    * Parse a NER tag in Arvores Deitadas format.