You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by rf...@apache.org on 2011/11/23 04:37:04 UTC

svn commit: r1205284 - /incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java

Author: rfrovarp
Date: Wed Nov 23 04:37:03 2011
New Revision: 1205284

URL: http://svn.apache.org/viewvc?rev=1205284&view=rev
Log:
Use Tika's LinkContentHandler to extract the links instead of our own LinkExtractor. Add the BoilerpipeContentHandler and BodyContentHandler for future use.

Modified:
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java

Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java?rev=1205284&r1=1205283&r2=1205284&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/TikaHtmlParser.java Wed Nov 23 04:37:03 2011
@@ -19,8 +19,9 @@ package org.apache.droids.tika;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
-import java.util.HashMap;
-import java.util.Map;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.TransformerConfigurationException;
@@ -28,10 +29,12 @@ import javax.xml.transform.sax.SAXTransf
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.LinkTask;
 import org.apache.droids.api.ContentEntity;
 import org.apache.droids.api.Link;
 import org.apache.droids.exception.DroidsException;
-import org.apache.droids.parse.html.LinkExtractor;
 import org.apache.droids.tika.api.TikaParse;
 import org.apache.droids.tika.api.TikaParser;
 import org.apache.droids.tika.parse.TikaParseImpl;
@@ -39,23 +42,15 @@ import org.apache.tika.exception.TikaExc
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
 import org.xml.sax.SAXException;
 
 public class TikaHtmlParser implements TikaParser {
 
-  private Map<String, String> elements= null;
-
-  public Map<String, String> getElements() {
-    if (elements == null) {
-      elements = new HashMap<String, String>();
-    }
-    return elements;
-  }
-
-  public void setElements(Map<String, String> elements) {
-    this.elements = elements;
-  }
+  protected static final Log log = LogFactory.getLog(TikaHtmlParser.class);
 
   @Override
   public TikaParse parse(ContentEntity entity, Link link) throws IOException, DroidsException {
@@ -69,6 +64,8 @@ public class TikaHtmlParser implements T
     }
     
     StringWriter dataBuffer = new StringWriter();
+    StringWriter bodyBuffer = new StringWriter();
+    StringWriter mainContentBuffer = new StringWriter();
      
     SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
     TransformerHandler xmlHandler;
@@ -80,15 +77,29 @@ public class TikaHtmlParser implements T
     xmlHandler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
     xmlHandler.setResult(new StreamResult(dataBuffer));
     
-    LinkExtractor extractor = new LinkExtractor(link, elements);
+    BoilerpipeContentHandler mainContentHandler = new BoilerpipeContentHandler(mainContentBuffer);
+    BodyContentHandler bodyHandler = new BodyContentHandler(bodyBuffer);
+    LinkContentHandler linkHandler = new LinkContentHandler();
     
-    TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, extractor);
+    TeeContentHandler parallelHandler = new TeeContentHandler(xmlHandler, mainContentHandler, bodyHandler, linkHandler );
 
     InputStream instream = entity.obtainContent();
     try {
       parser.parse(instream, parallelHandler, metadata, new ParseContext());
       
-      return new TikaParseImpl(dataBuffer.toString(), extractor.getLinks());
+      ArrayList<Link> extractedTasks = new ArrayList<Link>();
+      int depth = link.getDepth() + 1;
+      for(org.apache.tika.sax.Link tikaLink : linkHandler.getLinks()) {
+        try {
+          extractedTasks.add(new LinkTask(link, new URI(tikaLink.getUri()), depth, tikaLink.getText()));
+        } catch (URISyntaxException e) {
+          if(log.isWarnEnabled()) {
+            log.warn("URI not valid: "+ tikaLink.getUri());
+          }
+        }
+      }
+      
+      return new TikaParseImpl(dataBuffer.toString(), extractedTasks);
     } catch (SAXException ex) {
       throw new DroidsException("Failure parsing document " + link.getId(), ex);
     } catch (TikaException ex) {