You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by og...@apache.org on 2011/04/19 14:32:39 UTC
svn commit: r1095062 - in
/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src:
main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/
test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/
Author: ogrisel
Date: Tue Apr 19 12:32:39 2011
New Revision: 1095062
URL: http://svn.apache.org/viewvc?rev=1095062&view=rev
Log:
STANBOL-176: NER engine should not put control chars in text literals of the annotation graph
Modified:
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1095062&r1=1095061&r2=1095062&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Tue Apr 19 12:32:39 2011
@@ -26,6 +26,7 @@ import static org.apache.stanbol.enhance
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
@@ -236,6 +237,7 @@ public class NEREngineCore implements En
// version with explicit sentence endings to reflect heading / paragraph
// structure of an HTML or PDF document converted to text
String textWithDots = text.replaceAll("\\n\\n", ".\n");
+ text = removeNonUtf8CompliantCharacters(text);
SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel);
@@ -313,4 +315,24 @@ public class NEREngineCore implements En
}
return CANNOT_ENHANCE;
}
+
+    /**
+     * Replace characters that XML 1.0 cannot represent (control characters other than tab, line feed and
+     * carriage return, plus U+FFFE and U+FFFF) with a space, so as to avoid polluting the annotation graph
+     * with snippets that are not serializable as XML.
+     * <p>
+     * The check is done on chars rather than on UTF-8 encoded bytes: Java bytes are signed, so a byte-wise
+     * range test would also blank out every non-ASCII character. The returned string always has the same
+     * length as the input.
+     *
+     * @param text the text to sanitize, may be null
+     * @return the sanitized text, or null when text is null
+     */
+    protected static String removeNonUtf8CompliantCharacters(final String text) {
+        if (null == text) {
+            return null;
+        }
+        final char[] chars = text.toCharArray();
+        for (int i = 0; i < chars.length; i++) {
+            final char ch = chars[i];
+            // keep tab / LF / CR and everything from U+0020 up to U+FFFD; surrogate halves are kept so
+            // that supplementary characters survive untouched
+            if ((ch < ' ' && ch != '\t' && ch != '\n' && ch != '\r') || ch > '\uFFFD') {
+                chars[i] = ' ';
+            }
+        }
+        return new String(chars);
+    }
}
Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1095062&r1=1095061&r2=1095062&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Tue Apr 19 12:32:39 2011
@@ -49,6 +49,9 @@ public class TestNamedEntityExtractionEn
public static final String SINGLE_SENTENCE = "Dr Patrick Marshall (1869 - November 1950) was a"
+ " geologist who lived in New Zealand and worked at the University of Otago.";
+
+ /** Same sentence as {@link #SINGLE_SENTENCE}, with the control characters U+0014 and U+0015 placed around "geologist". */
+ public static final String SINGLE_SENTENCE_WITH_CONTROL_CHARS = "Dr Patrick Marshall (1869 - November 1950) was a"
+ + " \u0014geologist\u0015 who lived in New Zealand and worked at the University of Otago.";
public static final String MULTI_SENTENCES = "The life of Patrick Marshall\n\n"
+ "Dr Patrick Marshall (1869 - November 1950) was a"
@@ -120,6 +123,20 @@ public class TestNamedEntityExtractionEn
}
@Test
+    public void testPersonNameOccurrencesExtractionWithControlChars() {
+        // The fixture carries two control characters (U+0014 and U+0015); the person name must still be
+        // found exactly once and neither control character may show up in the reported context.
+        Map<String, List<NameOccurrence>> nameOccurrences = nerEngine.extractPersonNameOccurrences(SINGLE_SENTENCE_WITH_CONTROL_CHARS);
+        assertEquals(1, nameOccurrences.size());
+
+        List<NameOccurrence> pmOccurrences = nameOccurrences.get("Patrick Marshall");
+        assertNotNull(pmOccurrences);
+        assertEquals(1, pmOccurrences.size());
+
+        NameOccurrence firstOccurrence = pmOccurrences.get(0);
+        assertEquals("Patrick Marshall", firstOccurrence.name);
+        assertFalse(firstOccurrence.context.contains("\u0014"));
+        assertFalse(firstOccurrence.context.contains("\u0015"));
+    }
+
+ @Test
public void testLocationNamesExtraction() {
Collection<String> names = nerEngine.extractLocationNames(SINGLE_SENTENCE);
assertEquals(1, names.size());