You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by wk...@apache.org on 2011/06/01 10:45:32 UTC

svn commit: r1130051 - /incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java

Author: wkasper
Date: Wed Jun  1 08:45:31 2011
New Revision: 1130051

URL: http://svn.apache.org/viewvc?rev=1130051&view=rev
Log:
Stanbol-213: The OpenNLP-NER can use plain text extracted by Metaxa

Modified:
    incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java

Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1130051&r1=1130050&r2=1130051&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Wed Jun  1 08:45:31 2011
@@ -23,6 +23,7 @@ import static org.apache.stanbol.enhance
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -30,6 +31,7 @@ import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -45,6 +47,7 @@ import opennlp.tools.util.Span;
 
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.commons.io.IOUtils;
@@ -103,11 +106,19 @@ public class NEREngineCore implements En
     }
 
     public void computeEnhancements(ContentItem ci) throws EngineException {
-        String text;
-        try {
-            text = IOUtils.toString(ci.getStream(), "UTF-8");
-        } catch (IOException e) {
-            throw new InvalidContentException(this, ci, e);
+        String mimeType = ci.getMimeType().split(";", 2)[0];
+        String text = "";
+        if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) {
+            try {
+                text = IOUtils.toString(ci.getStream(),"UTF-8");
+            } catch (IOException e) {
+                throw new InvalidContentException(this, ci, e);
+            }
+        } else {
+            Iterator<Triple> it = ci.getMetadata().filter(new UriRef(ci.getId()), NIE_PLAINTEXTCONTENT, null);
+            while (it.hasNext()) {
+                text += it.next().getObject();
+            }
         }
         if (text.trim().length() == 0) {
             // TODO: make the length of the data a field of the ContentItem
@@ -313,6 +324,12 @@ public class NEREngineCore implements En
         if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
             return ENHANCE_SYNCHRONOUS;
         }
+        // check for existence of textual content in metadata
+        UriRef subj = new UriRef(ci.getId());
+        Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
+        if (it.hasNext()) {
+            return ENHANCE_SYNCHRONOUS;
+        }
         return CANNOT_ENHANCE;
     }