You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by rf...@apache.org on 2011/12/02 02:30:13 UTC
svn commit: r1209332 - in /incubator/droids/trunk:
droids-core/src/main/java/org/apache/droids/api/
droids-core/src/main/java/org/apache/droids/parse/html/
droids-tika/src/main/java/org/apache/droids/tika/
Author: rfrovarp
Date: Fri Dec 2 02:30:10 2011
New Revision: 1209332
URL: http://svn.apache.org/viewvc?rev=1209332&view=rev
Log:
Apply the patch from DROIDS-158.
Thank you to Tobias Rübner for reporting and providing a patch.
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java?rev=1209332&r1=1209331&r2=1209332&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/api/Parser.java Fri Dec 2 02:30:10 2011
@@ -33,9 +33,9 @@ public interface Parser {
*
* @param entity
* the underlying stream we are using
- * @param link
- * the link that correspond to the stream
+ * @param task
+ * the task that corresponds to the stream
* @return the parse object
*/
- Parse parse(ContentEntity entity, Link link) throws DroidsException, IOException;
+ Parse parse(ContentEntity entity, Task task) throws DroidsException, IOException;
}
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=1209332&r1=1209331&r2=1209332&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java Fri Dec 2 02:30:10 2011
@@ -27,6 +27,7 @@ import org.apache.droids.api.ContentEnti
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
import org.apache.droids.api.Parser;
+import org.apache.droids.api.Task;
import org.apache.droids.exception.ContentFormatViolationException;
import org.apache.droids.exception.DroidsException;
import org.apache.droids.parse.ParseImpl;
@@ -65,12 +66,12 @@ public class HtmlParser implements Parse
}
@Override
- public Parse parse(ContentEntity entity, Link newLink) throws DroidsException, IOException {
+ public Parse parse(ContentEntity entity, Task newLink) throws DroidsException, IOException {
// setup filter chain
XMLDocumentFilter[] filters = { getRemover() };
// create HTML parser
SAXParser parser = getParser(filters);
- LinkExtractor linkExtractor = new LinkExtractor(newLink, elements);
+ LinkExtractor linkExtractor = new LinkExtractor((Link)newLink, elements);
parser.setContentHandler(linkExtractor);
InputStream instream = entity.obtainContent();
try {
Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java?rev=1209332&r1=1209331&r2=1209332&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaDocumentParser.java Fri Dec 2 02:30:10 2011
@@ -36,6 +36,7 @@ import org.apache.commons.logging.LogFac
import org.apache.droids.LinkTask;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
+import org.apache.droids.api.Task;
import org.apache.droids.exception.DroidsException;
import org.apache.droids.tika.api.TikaParse;
import org.apache.droids.tika.api.TikaParser;
@@ -55,7 +56,7 @@ public class TikaDocumentParser implemen
protected static final Log log = LogFactory.getLog(TikaDocumentParser.class);
@Override
- public TikaParse parse(ContentEntity entity, Link link) throws DroidsException,
+ public TikaParse parse(ContentEntity entity, Task task) throws DroidsException,
IOException {
// Init Tika objects
org.apache.tika.parser.Parser parser = new AutoDetectParser();
@@ -91,22 +92,23 @@ public class TikaDocumentParser implemen
parser.parse(instream, parallelHandler, metadata, new ParseContext());
ArrayList<Link> extractedTasks = new ArrayList<Link>();
- int depth = link.getDepth() + 1;
- for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
- try {
- extractedTasks.add(new LinkTask(link, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
- } catch (URISyntaxException e) {
- if(log.isWarnEnabled()) {
- log.warn("URI not valid: "+ tikaLink.getUri());
- }
- }
+ int depth = task.getDepth() + 1;
+ if (task instanceof LinkTask) {
+ for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+ try {
+ extractedTasks.add(new LinkTask((LinkTask)task, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
+ } catch (URISyntaxException e) {
+ if(log.isWarnEnabled()) {
+ log.warn("URI not valid: "+ tikaLink.getUri());
+ }
+ }
+ }
}
-
return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
} catch (SAXException ex) {
- throw new DroidsException("Failure parsing document " + link.getId(), ex);
+ throw new DroidsException("Failure parsing document " + task.getId(), ex);
} catch (TikaException ex) {
- throw new DroidsException("Failure parsing document " + link.getId(), ex);
+ throw new DroidsException("Failure parsing document " + task.getId(), ex);
} finally {
instream.close();
}
Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1209332&r1=1209331&r2=1209332&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Fri Dec 2 02:30:10 2011
@@ -34,6 +34,7 @@ import org.apache.commons.logging.LogFac
import org.apache.droids.LinkTask;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
+import org.apache.droids.api.Task;
import org.apache.droids.exception.DroidsException;
import org.apache.droids.tika.api.TikaParse;
import org.apache.droids.tika.api.TikaParser;
@@ -59,7 +60,7 @@ public class TikaHtmlParser implements T
protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);
@Override
- public TikaParse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
+ public TikaParse parse(ContentEntity entity, Task task) throws IOException, DroidsException {
// Init Tika objects
org.apache.tika.parser.Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
@@ -94,22 +95,23 @@ public class TikaHtmlParser implements T
parser.parse(instream, parallelHandler, metadata, new ParseContext());
ArrayList<Link> extractedTasks = new ArrayList<Link>();
- int depth = link.getDepth() + 1;
- for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
- try {
- extractedTasks.add(new LinkTask(link, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
- } catch (URISyntaxException e) {
- if(log.isWarnEnabled()) {
- log.warn("URI not valid: "+ tikaLink.getUri());
- }
- }
+ if (task instanceof Link) {
+ int depth = task.getDepth() + 1;
+ for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+ try {
+ extractedTasks.add(new LinkTask((Link)task, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
+ } catch (URISyntaxException e) {
+ if(log.isWarnEnabled()) {
+ log.warn("URI not valid: "+ tikaLink.getUri());
+ }
+ }
+ }
}
-
return new TikaParseImpl(dataBuffer.toString(), extractedTasks, bodyBuffer.toString(), mainContentBuffer.toString(), metadata);
} catch (SAXException ex) {
- throw new DroidsException("Failure parsing document " + link.getId(), ex);
+ throw new DroidsException("Failure parsing document " + task.getId(), ex);
} catch (TikaException ex) {
- throw new DroidsException("Failure parsing document " + link.getId(), ex);
+ throw new DroidsException("Failure parsing document " + task.getId(), ex);
} finally {
instream.close();
}