You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by rf...@apache.org on 2011/11/23 04:37:04 UTC
svn commit: r1205284 - /incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
Author: rfrovarp
Date: Wed Nov 23 04:37:03 2011
New Revision: 1205284
URL: http://svn.apache.org/viewvc?rev=1205284&view=rev
Log:
Use Tika's LinkContentHandler to extract the links instead of our own LinkExtractor. Add the Boilerpipe and body content handlers for future use.
Modified:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1205284&r1=1205283&r2=1205284&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Wed Nov 23 04:37:03 2011
@@ -19,8 +19,9 @@ package org.apache.droids.tika;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
-import java.util.HashMap;
-import java.util.Map;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerConfigurationException;
@@ -28,10 +29,12 @@ import javax.xml.transform.sax.SAXTransf
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.LinkTask;
import org.apache.droids.api.ContentEntity;
import org.apache.droids.api.Link;
import org.apache.droids.exception.DroidsException;
-import org.apache.droids.parse.html.LinkExtractor;
import org.apache.droids.tika.api.TikaParse;
import org.apache.droids.tika.api.TikaParser;
import org.apache.droids.tika.parse.TikaParseImpl;
@@ -39,23 +42,15 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.xml.sax.SAXException;
public class TikaHtmlParser implements TikaParser {
- private Map<String, String> elements= null;
-
- public Map<String, String> getElements() {
- if (elements == null) {
- elements = new HashMap<String, String>();
- }
- return elements;
- }
-
- public void setElements(Map<String, String> elements) {
- this.elements = elements;
- }
+ protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);
@Override
public TikaParse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
@@ -69,6 +64,8 @@ public class TikaHtmlParser implements T
}
StringWriter dataBuffer = new StringWriter();
+ StringWriter bodyBuffer = new StringWriter();
+ StringWriter mainContentBuffer = new StringWriter();
SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
TransformerHandler xmlHandler;
@@ -80,15 +77,29 @@ public class TikaHtmlParser implements T
xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
xmlHandler.setResult(new StreamResult(dataBuffer));
- LinkExtractor extractor = new LinkExtractor(link, elements);
+ BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
+ BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
+ LinkContentHandler linkHandler = new LinkContentHandler();
- TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, extractor);
+ TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );
InputStream instream = entity.obtainContent();
try {
parser.parse(instream, parallelHandler, metadata, new ParseContext());
- return new TikaParseImpl(dataBuffer.toString(), extractor.getLinks());
+ ArrayList<Link> extractedTasks = new ArrayList<Link>();
+ int depth = link.getDepth() + 1;
+ for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+ try {
+ extractedTasks.add(new LinkTask(link, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
+ } catch (URISyntaxException e) {
+ if(log.isWarnEnabled()) {
+ log.warn("URI not valid: "+ tikaLink.getUri());
+ }
+ }
+ }
+
+ return new TikaParseImpl(dataBuffer.toString(), extractedTasks);
} catch (SAXException ex) {
throw new DroidsException("Failure parsing document " + link.getId(), ex);
} catch (TikaException ex) {