You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by og...@apache.org on 2011/04/19 14:32:39 UTC
svn commit: r1095062 - in /incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src: main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/

Author: ogrisel
Date: Tue Apr 19 12:32:39 2011
New Revision: 1095062

URL: http://svn.apache.org/viewvc?rev=1095062&view=rev
Log:
STANBOL-176: NER engine should not put control chars in text literals of the annotation graph

Modified:
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java

Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1095062&r1=1095061&r2=1095062&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Tue Apr 19 12:32:39 2011
@@ -26,6 +26,7 @@ import static org.apache.stanbol.enhance
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@@ -236,6 +237,7 @@ public class NEREngineCore implements En
         // version with explicit sentence endings to reflect heading / paragraph
         // structure of an HTML or PDF document converted to text
         String textWithDots = text.replaceAll("\\n\\n", ".\n");
+        text = removeNonUtf8CompliantCharacters(text);
 
         SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel);
 
@@ -313,4 +315,24 @@ public class NEREngineCore implements En
         }
         return CANNOT_ENHANCE;
     }
+
+    /**
+     * Replace every byte of the UTF-8 encoding of {@code text} outside 32..127, other than tab, LF and CR, by a space,
+     * so as to avoid polluting the annotation graph with snippets that are not serializable as XML; null stays null.
+     */
+    protected static String removeNonUtf8CompliantCharacters(final String text) {
+        if (null == text) {
+            return null;
+        }
+        byte[] bytes = text.getBytes(Charset.forName("UTF-8"));
+        for (int i = 0; i < bytes.length; i++) {
+            byte ch = bytes[i];
+            // keep printable ASCII plus tab, LF and CR and blank every other byte. NOTE(review): byte is signed, so
+            // "ch < 253" is always true and all bytes of multi-byte UTF-8 sequences (negative values) are blanked too
+            if (!((ch > 31 && ch < 253) || ch == '\t' || ch == '\n' || ch == '\r')) {
+                bytes[i] = ' ';
+            }
+        }
+        return new String(bytes); // NOTE(review): platform default charset is used; only ASCII bytes remain here
+    }
 }

Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1095062&r1=1095061&r2=1095062&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Tue Apr 19 12:32:39 2011
@@ -49,6 +49,9 @@ public class TestNamedEntityExtractionEn
 
     public static final String SINGLE_SENTENCE = "Dr Patrick Marshall (1869 - November 1950) was a"
             + " geologist who lived in New Zealand and worked at the University of Otago.";
+    
+    public static final String SINGLE_SENTENCE_WITH_CONTROL_CHARS = "Dr Patrick Marshall (1869 - November 1950) was a" 
+    		+ " \u0014geologist\u0015 who lived in New Zealand and worked at the University of Otago.";
 
     public static final String MULTI_SENTENCES = "The life of Patrick Marshall\n\n"
             + "Dr Patrick Marshall (1869 - November 1950) was a"
@@ -120,6 +123,20 @@ public class TestNamedEntityExtractionEn
     }
 
     @Test
+    public void testPersonNameOccurrencesExtractionWithControlChars() { // sentence embeds U+0014 and U+0015
+        Map<String, List<NameOccurrence>> nameOccurrences = nerEngine.extractPersonNameOccurrences(SINGLE_SENTENCE_WITH_CONTROL_CHARS);
+        assertEquals(1, nameOccurrences.size()); // exactly one distinct person name is expected
+
+        List<NameOccurrence> pmOccurrences = nameOccurrences.get("Patrick Marshall");
+        assertNotNull(pmOccurrences);
+        assertEquals(1, pmOccurrences.size()); // and that name is expected to occur once
+
+        NameOccurrence firstOccurrence = pmOccurrences.get(0);
+        assertEquals("Patrick Marshall", firstOccurrence.name);
+        assertFalse(firstOccurrence.context.contains("\u0014")); // NOTE(review): U+0015 is not checked
+    }
+    
+    @Test
     public void testLocationNamesExtraction() {
         Collection<String> names = nerEngine.extractLocationNames(SINGLE_SENTENCE);
         assertEquals(1, names.size());