You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/24 10:59:55 UTC
svn commit: r1413167 - in
/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner: ./
src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/
src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/
Author: rwesten
Date: Sat Nov 24 09:59:54 2012
New Revision: 1413167
URL: http://svn.apache.org/viewvc?rev=1413167&view=rev
Log:
STANBOL-733: merged changes of the opennlp-ner engine from trunk
Modified:
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/ (props changed)
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
Propchange: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/
------------------------------------------------------------------------------
Merged /stanbol/trunk/enhancer/engines/opennlp-ner:r1405731-1413162
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml?rev=1413167&r1=1413166&r2=1413167&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/pom.xml Sat Nov 24 09:59:54 2012
@@ -143,13 +143,13 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.data.opennlp.lang.en</artifactId>
- <version>1.0.2-SNAPSHOT</version>
+ <version>1.1.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.data.opennlp.ner.en</artifactId>
- <version>1.0.2-SNAPSHOT</version>
+ <version>1.1.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<dependency>
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1413167&r1=1413166&r2=1413167&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Sat Nov 24 09:59:54 2012
@@ -597,17 +597,26 @@ public abstract class NEREngineCore
if (null == text) {
return null;
}
- Charset UTF8 = Charset.forName("UTF-8");
- byte[] bytes = text.getBytes(UTF8);
- for (int i = 0; i < bytes.length; i++) {
- byte ch = bytes[i];
+ StringBuilder sb = null; //initialised on the first replacement
+ for (int i = 0; i < text.length(); i++) {
+ int ch = text.codePointAt(i);
// remove any characters outside the valid UTF-8 range as well as all control characters
// except tabs and new lines
- if (!((ch > 31 && ch < 253) || ch == '\t' || ch == '\n' || ch == '\r')) {
- bytes[i] = ' ';
+ //NOTE: rwesten (2012-11-21) replaced the original check with the one
+ // found at http://blog.mark-mclaren.info/2007/02/invalid-xml-characters-when-valid-utf8_5873.html
+ if (!((ch == 0x9) ||
+ (ch == 0xA) ||
+ (ch == 0xD) ||
+ ((ch >= 0x20) && (ch <= 0xD7FF)) ||
+ ((ch >= 0xE000) && (ch <= 0xFFFD)) ||
+ ((ch >= 0x10000) && (ch <= 0x10FFFF)))){
+ if(sb == null){
+ sb = new StringBuilder(text);
+ }
+ sb.setCharAt(i, ' ');
}
}
- return new String(bytes, UTF8);
+ return sb == null ? text : sb.toString();
}
/**
Modified: stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java?rev=1413167&r1=1413166&r2=1413167&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java (original)
+++ stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-ner/src/test/java/org/apache/stanbol/enhancer/engines/opennlp/impl/TestNamedEntityExtractionEnhancementEngine.java Sat Nov 24 09:59:54 2012
@@ -51,7 +51,7 @@ public class TestNamedEntityExtractionEn
+ " geologist who lived in New Zealand and worked at the University of Otago.";
public static final String SINGLE_SENTENCE_WITH_CONTROL_CHARS = "Dr Patrick Marshall (1869 - November 1950) was a"
- + " \u0014geologist\u0015 who lived in New Zealand and worked at the University of Otago.";
+ + " \u0014geologist\u0015 who lived in New\tZealand and worked at the University\nof Otago.";
public static final String MULTI_SENTENCES = "The life of Patrick Marshall\n\n"
+ "Dr Patrick Marshall (1869 - November 1950) was a"
@@ -135,6 +135,8 @@ public class TestNamedEntityExtractionEn
NameOccurrence firstOccurrence = pmOccurrences.get(0);
assertEquals("Patrick Marshall", firstOccurrence.name);
assertFalse(firstOccurrence.context.contains("\u0014"));
+ assertTrue(firstOccurrence.context.contains("\t"));
+ assertTrue(firstOccurrence.context.contains("\n"));
}
@Test