You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/24 10:59:55 UTC

svn commit: r1413167 - in /stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner: ./ src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/ src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/

Author: rwesten
Date: Sat Nov 24 09:59:54 2012
New Revision: 1413167

URL: http://svn.apache.org/viewvc?rev=1413167&view=rev
Log:
STANBOL-733: merged changes of the opennlp-ner engine from trunk

Modified:
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/   (props changed)
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
    stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java

Propchange: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/
------------------------------------------------------------------------------
  Merged /stanbol/trunk/enhancer/engines/opennlp-ner:r1405731-1413162

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml?rev=1413167&r1=1413166&r2=1413167&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml Sat Nov 24 09:59:54 2012
@@ -143,13 +143,13 @@
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.data.opennlp.lang.en</artifactId>
-      <version>1.0.2-SNAPSHOT</version>
+      <version>1.1.0-SNAPSHOT</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.data.opennlp.ner.en</artifactId>
-      <version>1.0.2-SNAPSHOT</version>
+      <version>1.1.0-SNAPSHOT</version>
       <scope>test</scope>
     </dependency>
     <dependency>

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1413167&r1=1413166&r2=1413167&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Sat Nov 24 09:59:54 2012
@@ -597,17 +597,26 @@ public abstract class NEREngineCore 
         if (null == text) {
             return null;
         }
-        Charset UTF8 = Charset.forName("UTF-8");
-        byte[] bytes = text.getBytes(UTF8);
-        for (int i = 0; i < bytes.length; i++) {
-            byte ch = bytes[i];
+        StringBuilder sb = null; //initialised on the first replacement
+        for (int i = 0; i < text.length(); i++) {
+            int ch = text.codePointAt(i);
             // remove any characters outside the valid UTF-8 range as well as all control characters
             // except tabs and new lines
-            if (!((ch > 31 && ch < 253) || ch == '\t' || ch == '\n' || ch == '\r')) {
-                bytes[i] = ' ';
+            //NOTE: rewesten (2012-11-21) replaced the original check with the one
+            // found at http://blog.mark-mclaren.info/2007/02/invalid-xml-characters-when-valid-utf8_5873.html
+            if (!((ch == 0x9) ||
+                    (ch == 0xA) ||
+                    (ch == 0xD) ||
+                    ((ch >= 0x20) && (ch <= 0xD7FF)) ||
+                    ((ch >= 0xE000) && (ch <= 0xFFFD)) ||
+                    ((ch >= 0x10000) && (ch <= 0x10FFFF)))){
+                if(sb == null){
+                    sb = new StringBuilder(text);
+                }
+                sb.setCharAt(i, ' ');
             }
         }
-        return new String(bytes, UTF8);
+        return sb == null ? text : sb.toString();
     }
 
     /**

Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1413167&r1=1413166&r2=1413167&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Sat Nov 24 09:59:54 2012
@@ -51,7 +51,7 @@ public class TestNamedEntityExtractionEn
             + " geologist who lived in New Zealand and worked at the University of Otago.";
     
     public static final String SINGLE_SENTENCE_WITH_CONTROL_CHARS = "Dr Patrick Marshall (1869 - November 1950) was a" 
-    		+ " \u0014geologist\u0015 who lived in New Zealand and worked at the University of Otago.";
+    		+ " \u0014geologist\u0015 who lived in New\tZealand and worked at the University\nof Otago.";
 
     public static final String MULTI_SENTENCES = "The life of Patrick Marshall\n\n"
             + "Dr Patrick Marshall (1869 - November 1950) was a"
@@ -135,6 +135,8 @@ public class TestNamedEntityExtractionEn
         NameOccurrence firstOccurrence = pmOccurrences.get(0);
         assertEquals("Patrick Marshall", firstOccurrence.name);
         assertFalse(firstOccurrence.context.contains("\u0014"));
+        assertTrue(firstOccurrence.context.contains("\t"));
+        assertTrue(firstOccurrence.context.contains("\n"));
     }
     
     @Test