You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by rf...@apache.org on 2011/12/02 02:30:13 UTC

svn commit: r1209332 - in /incubator/droids/trunk: droids-core/src/main/java/org/apache/droids/api/ droids-core/src/main/java/org/apache/droids/parse/html/ droids-tika/src/main/java/org/apache/droids/tika/

Author: rfrovarp
Date: Fri Dec  2 02:30:10 2011
New Revision: 1209332

URL: http://svn.apache.org/viewvc?rev=1209332&view=rev
Log:
Apply the patch from DROIDS-158.
Thank you to Tobias Rübner for reporting and providing a patch.

Modified:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java?rev=1209332&r1=1209331&r2=1209332&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java Fri Dec  2 02:30:10 2011
@@ -33,9 +33,9 @@ public interface Parser {
    * 
    * @param entity
    *                the underlying stream we are using
-   * @param link
-   *                the link that correspond to the stream
+   * @param task
+   *                the task that corresponds to the stream
    * @return the parse object
    */
-  Parse parse(ContentEntity entity, Link link) throws DroidsException, IOException;
+  Parse parse(ContentEntity entity, Task task) throws DroidsException, IOException;
 }

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=1209332&r1=1209331&r2=1209332&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java Fri Dec  2 02:30:10 2011
@@ -27,6 +27,7 @@ import org.apache.droids.api.ContentEnti
 import org.apache.droids.api.Link;
 import org.apache.droids.api.Parse;
 import org.apache.droids.api.Parser;
+import org.apache.droids.api.Task;
 import org.apache.droids.exception.ContentFormatViolationException;
 import org.apache.droids.exception.DroidsException;
 import org.apache.droids.parse.ParseImpl;
@@ -65,12 +66,12 @@ public class HtmlParser implements Parse
   }
 
   @Override
-  public Parse parse(ContentEntity entity, Link newLink) throws DroidsException, IOException {
+  public Parse parse(ContentEntity entity, Task newLink) throws DroidsException, IOException {
     // setup filter chain
     XMLDocumentFilter[] filters = { getRemover() };
     // create HTML parser
     SAXParser parser = getParser(filters);
-    LinkExtractor linkExtractor = new LinkExtractor(newLink, elements);
+    LinkExtractor linkExtractor = new LinkExtractor((Link)newLink, elements);
     parser.setContentHandler(linkExtractor);
     InputStream instream = entity.obtainContent();
     try {

Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java?rev=1209332&r1=1209331&r2=1209332&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java Fri Dec  2 02:30:10 2011
@@ -36,6 +36,7 @@ import org.apache.commons.logging.LogFac
 import org.apache.droids.LinkTask;
 import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Link;
+import org.apache.droids.api.Task;
 import org.apache.droids.exception.DroidsException;
 import org.apache.droids.tika.api.TikaParse;
 import org.apache.droids.tika.api.TikaParser;
@@ -55,7 +56,7 @@ public class TikaDocumentParser implemen
   protected static final Log log = LogFactory.getLog(TikaDocumentParser.class);
   
   @Override
-  public TikaParse parse(ContentEntity entity, Link link) throws DroidsException,
+  public TikaParse parse(ContentEntity entity, Task task) throws DroidsException,
       IOException {
     // Init Tika objects
     org.apache.tika.parser.Parser parser = new AutoDetectParser();
@@ -91,22 +92,23 @@ public class TikaDocumentParser implemen
       parser.parse(instream, parallelHandler, metadata, new ParseContext());
       
       ArrayList<Link> extractedTasks = new ArrayList<Link>();
-      int depth = link.getDepth() + 1;
-      for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
-        try {
-          extractedTasks.add(new LinkTask(link, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
-        } catch (URISyntaxException e) {
-          if(log.isWarnEnabled()) {
-            log.warn("URI not valid: "+ tikaLink.getUri());
-          }
-        }
+      int depth = task.getDepth() + 1;
+      if (task instanceof LinkTask) {
+	      for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+	        try {
+	          extractedTasks.add(new LinkTask((LinkTask)task, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
+	        } catch (URISyntaxException e) {
+	          if(log.isWarnEnabled()) {
+	            log.warn("URI not valid: "+ tikaLink.getUri());
+	          }
+	        }
+	      }
       }
-      
       return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
     } catch (SAXException ex) {
-      throw new DroidsException("Failure parsing document " + link.getId(), ex);
+      throw new DroidsException("Failure parsing document " + task.getId(), ex);
     } catch (TikaException ex) {
-      throw new DroidsException("Failure parsing document " + link.getId(), ex);
+      throw new DroidsException("Failure parsing document " + task.getId(), ex);
     } finally {
       instream.close();
     } 

Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1209332&r1=1209331&r2=1209332&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Fri Dec  2 02:30:10 2011
@@ -34,6 +34,7 @@ import org.apache.commons.logging.LogFac
 import org.apache.droids.LinkTask;
 import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Link;
+import org.apache.droids.api.Task;
 import org.apache.droids.exception.DroidsException;
 import org.apache.droids.tika.api.TikaParse;
 import org.apache.droids.tika.api.TikaParser;
@@ -59,7 +60,7 @@ public class TikaHtmlParser implements T
   protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);
 
   @Override
-  public TikaParse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
+  public TikaParse parse(ContentEntity entity, Task task) throws IOException, DroidsException {
     // Init Tika objects
     org.apache.tika.parser.Parser parser = new AutoDetectParser();
     Metadata metadata = new Metadata();
@@ -94,22 +95,23 @@ public class TikaHtmlParser implements T
       parser.parse(instream, parallelHandler, metadata, new ParseContext());
       
       ArrayList<Link> extractedTasks = new ArrayList<Link>();
-      int depth = link.getDepth() + 1;
-      for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
-        try {
-          extractedTasks.add(new LinkTask(link, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
-        } catch (URISyntaxException e) {
-          if(log.isWarnEnabled()) {
-            log.warn("URI not valid: "+ tikaLink.getUri());
-          }
-        }
+      if (task instanceof Link) {
+	      int depth = task.getDepth() + 1;
+	      for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+	        try {
+	          extractedTasks.add(new LinkTask((Link)task, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
+	        } catch (URISyntaxException e) {
+	          if(log.isWarnEnabled()) {
+	            log.warn("URI not valid: "+ tikaLink.getUri());
+	          }
+	        }
+	      }
       }
-      
       return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
     } catch (SAXException ex) {
-      throw new DroidsException("Failure parsing document " + link.getId(), ex);
+      throw new DroidsException("Failure parsing document " + task.getId(), ex);
     } catch (TikaException ex) {
-      throw new DroidsException("Failure parsing document " + link.getId(), ex);
+      throw new DroidsException("Failure parsing document " + task.getId(), ex);
     } finally {
       instream.close();
     }