You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by wk...@apache.org on 2011/06/08 09:34:43 UTC

svn commit: r1133268 - /incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java

Author: wkasper
Date: Wed Jun  8 07:34:43 2011
New Revision: 1133268

URL: http://svn.apache.org/viewvc?rev=1133268&view=rev
Log:
STANBOL-213: Enable use of text extracts contained in metadata

Modified:
    incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java

Modified: incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java?rev=1133268&r1=1133267&r2=1133268&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java (original)
+++ incubator/stanbol/trunk/enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/RelatedTopicEnhancementEngine.java Wed Jun  8 07:34:43 2011
@@ -16,14 +16,18 @@
  */
 package org.apache.stanbol.enhancer.engines.autotagging.impl;
 
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
+
 import java.io.IOException;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Iterator;
 import java.util.List;
 
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
+import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.commons.io.IOUtils;
 import org.apache.felix.scr.annotations.Component;
@@ -74,11 +78,19 @@ public class RelatedTopicEnhancementEngi
                     + ci.getId());
             return;
         }
-        String text;
-        try {
-            text = IOUtils.toString(ci.getStream(),"UTF-8");
-        } catch (IOException e) {
-            throw new InvalidContentException(this, ci, e);
+        String mimeType = ci.getMimeType().split(";", 2)[0];
+        String text = "";
+        if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) {
+            try {
+                text = IOUtils.toString(ci.getStream(),"UTF-8");
+            } catch (IOException e) {
+                throw new InvalidContentException(this, ci, e);
+            }
+        } else {
+            Iterator<Triple> it = ci.getMetadata().filter(new UriRef(ci.getId()), NIE_PLAINTEXTCONTENT, null);
+            while (it.hasNext()) {
+                text += it.next().getObject();
+            }
         }
         if (text.trim().length() == 0) {
             // TODO: make the length of the data a field of the ContentItem
@@ -109,6 +121,12 @@ public class RelatedTopicEnhancementEngi
         if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
             return ENHANCE_SYNCHRONOUS;
         }
+        // check for existence of textual content in metadata
+        UriRef subj = new UriRef(ci.getId());
+        Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null);
+        if (it.hasNext()) {
+            return ENHANCE_SYNCHRONOUS;
+        }
         return CANNOT_ENHANCE;
     }