You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/03/27 19:01:27 UTC
svn commit: r1305904 -
/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
Author: colen
Date: Tue Mar 27 17:01:27 2012
New Revision: 1305904
URL: http://svn.apache.org/viewvc?rev=1305904&view=rev
Log:
OPENNLP-485: Improved how contractions are handled: some are expanded to more than two tokens. Also, we now force tokenization of named entities that contain punctuation.
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java?rev=1305904&r1=1305903&r2=1305904&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java Tue Mar 27 17:01:27 2012
@@ -22,8 +22,10 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
@@ -67,6 +69,10 @@ public class ADNameSampleStream implemen
*/
private static final Pattern tagPattern = Pattern.compile("<(NER:)?(.*?)>");
+ private static final Pattern whitespacePattern = Pattern.compile("\\s+");
+ private static final Pattern underlinePattern = Pattern.compile("[_]+");
+ private static final Pattern alphanumericPattern = Pattern.compile("^[\\p{L}\\p{Nd}-]+$");
+
/**
* Map to the Arvores Deitadas types to our types. It is read-only.
*/
@@ -254,7 +260,8 @@ public class ADNameSampleStream implemen
String c = PortugueseContractionUtility.toContraction(leftContractionPart, right);
if (c != null) {
- sentence.add(c);
+ String[] parts = whitespacePattern.split(c);
+ sentence.addAll(Arrays.asList(parts));
} else {
// contraction was missing!
sentence.add(leftContractionPart);
@@ -276,7 +283,7 @@ public class ADNameSampleStream implemen
if (leafTag != null) {
if (leafTag.contains("<sam->") && !alreadyAdded) {
- String[] lexemes = leaf.getLexeme().split("_");
+ String[] lexemes = underlinePattern.split(leaf.getLexeme());
if(lexemes.length > 1) {
sentence.addAll(Arrays.asList(lexemes).subList(0, lexemes.length - 1));
}
@@ -295,7 +302,7 @@ public class ADNameSampleStream implemen
}
if(!alreadyAdded) {
- sentence.addAll(Arrays.asList(leaf.getLexeme().split("_")));
+ sentence.addAll(processLexeme(leaf.getLexeme()));
}
if (namedEntityTag != null) {
@@ -306,7 +313,7 @@ public class ADNameSampleStream implemen
if (expandLastNER) {
// if the current leaf has the tag <NER2>, it can be the continuation of
// a NER.
- // we check if it is true, and expand the lest NER
+ // we check if it is true, and expand the last NER
int lastIndex = names.size() - 1;
Span last = null;
boolean error = false;
@@ -330,11 +337,42 @@ public class ADNameSampleStream implemen
}
-
-
-
-
+ private List<String> processLexeme(String lexemeStr) {
+ List<String> out = new ArrayList<String>();
+ String[] parts = underlinePattern.split(lexemeStr);
+ for (String tok : parts) {
+ if(tok.length() > 1 && !alphanumericPattern.matcher(tok).matches()) {
+ out.addAll(processTok(tok));
+ } else {
+ out.add(tok);
+ }
+ }
+ return out;
+ }
+ private Collection<? extends String> processTok(String tok) {
+ String original = tok;
+ List<String> out = new ArrayList<String>();
+ LinkedList<String> suffix = new LinkedList<String>();
+ char first = tok.charAt(0);
+ if (first == '«') {
+ out.add(Character.toString(first));
+ tok = tok.substring(1);
+ }
+ char last = tok.charAt(tok.length() - 1);
+ if (last == '»' || last == ':' || last == ',' || last == '!' ) {
+ suffix.add(Character.toString(last));
+ tok = tok.substring(0, tok.length() - 1);
+ }
+
+ if(!original.equals(tok) && tok.length() > 1 && !alphanumericPattern.matcher(tok).matches()) {
+ out.addAll(processTok(tok));
+ } else {
+ out.add(tok);
+ }
+ out.addAll(suffix);
+ return out;
+ }
/**
* Parse a NER tag in Arvores Deitadas format.