You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@labs.apache.org by th...@apache.org on 2008/08/26 14:53:17 UTC

svn commit: r689051 - in /labs/droids/trunk/src/core/java/org/apache/droids: DefaultWorker.java droids-core-context.xml droids-core-factories-context.xml parse/html/HtmlParser.java

Author: thorsten
Date: Tue Aug 26 05:53:16 2008
New Revision: 689051

URL: http://svn.apache.org/viewvc?rev=689051&view=rev
Log:
Enhancing the default crawler to also save images and the like

Modified:
    labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java
    labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml
    labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml
    labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java

Modified: labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java?rev=689051&r1=689050&r2=689051&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java Tue Aug 26 05:53:16 2008
@@ -62,21 +62,23 @@
         sleep(delayTimer.getDelayMillis());
       }
       setUri(link.getId());
-      Core.threadMessage("uri " + getUri());
-      setProtocol(protocolFactory.getProtocol(getUri()));
-      if (getProtocol().isAllowed(getUri())) {
-        String contentType = getProtocol().getContentType(getUri());
+      String url = getUri();
+      Core.threadMessage("url " + url);
+      setProtocol(protocolFactory.getProtocol(url));
+      long workerId = getId();
+      if (getProtocol().isAllowed(url)) {
+        String contentType = getProtocol().getContentType(url);
         Core.threadMessage("contentType " + contentType);
         parser = parserFactory.getParser(contentType);
         // parse contains the outlinks and can be used later
         Parse parse = getParse();
         handle(parse);
         //Core.threadMessage("Trying to shut down "+getId());
-        getDroid().finishedWorker(getId());
+        getDroid().finishedWorker(workerId);
       } else {
         Core.threadMessage("stopping processing since"
             + " bots are not allowed for this url.");
-        getDroid().finishedWorker(getId());
+        getDroid().finishedWorker(workerId);
       }
     } catch (Exception e) {
       e.printStackTrace();
@@ -85,8 +87,8 @@
   }
 
   protected void handle(Parse parse) throws MalformedURLException, IOException {
-    if (null != parse)
-      handlerFactory.handle(getProtocol().openStream(getUri()), new URL(getUri()), parse);
+      String url = getUri();
+      handlerFactory.handle(getProtocol().openStream(url), new URL(url), parse);
   }
 
   protected Parse getParse() {

Modified: labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml?rev=689051&r1=689050&r2=689051&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml Tue Aug 26 05:53:16 2008
@@ -14,7 +14,7 @@
 <beans xmlns="http://www.springframework.org/schema/beans"
   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
   xmlns:configurator="http://cocoon.apache.org/schema/configurator"
-  xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.0.xsd
+  xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
        http://cocoon.apache.org/schema/configurator http://cocoon.apache.org/schema/configurator/cocoon-configurator-1.0.1.xsd">
   
   <import resource="droids-core-factories-context.xml"/>

Modified: labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml?rev=689051&r1=689050&r2=689051&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml Tue Aug 26 05:53:16 2008
@@ -3,7 +3,7 @@
 <beans xmlns="http://www.springframework.org/schema/beans"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xmlns:configurator="http://cocoon.apache.org/schema/configurator"
-       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.0.xsd
+       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
        http://cocoon.apache.org/schema/configurator http://cocoon.apache.org/schema/configurator/cocoon-configurator-1.0.1.xsd">
   
   <!-- Core -  factories register -->

Modified: labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java?rev=689051&r1=689050&r2=689051&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java Tue Aug 26 05:53:16 2008
@@ -53,6 +53,8 @@
 
   public Parse getParse(InputStream stream, Task newLink) {
     this.link = newLink;
+    final String id = link.getId();
+    System.out.println("id: "+id);
     try {
       this.base = new URL(newLink.getId());
     } catch (MalformedURLException e1) {
@@ -60,10 +62,10 @@
     }
     ParseData parseData = null;
     // setup filter chain
-    XMLDocumentFilter[] filters = { getRemover() };
+    final XMLDocumentFilter[] filters = { getRemover() };
     // create HTML parser
-    DOMFragmentParser parser = getParser(filters);
-    DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+    final DOMFragmentParser parser = getParser(filters);
+    final DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
     // parse document
     // XMLInputSource source = new XMLInputSource(null, uri, uri);
     try {
@@ -77,7 +79,7 @@
   }
 
   private ParseData extract(DocumentFragment node) {
-    ArrayList<Outlink> links = new ArrayList<Outlink>();
+    final ArrayList<Outlink> links = new ArrayList<Outlink>();
     try {
       extractLinks(node, links, new HashSet<String>());
     } catch (MalformedURLException e) {
@@ -89,7 +91,7 @@
   }
 
   private DOMFragmentParser getParser(XMLDocumentFilter[] filters) {
-    DOMFragmentParser parser = new DOMFragmentParser();
+    final DOMFragmentParser parser = new DOMFragmentParser();
     try {
       parser.setProperty("http://cyberneko.org/html/properties/filters",
           filters);
@@ -112,9 +114,12 @@
 
   private static ElementRemover getRemover() {
     // create element remover filter
-    ElementRemover remover = new ElementRemover();
+    final ElementRemover remover = new ElementRemover();
     // set which elements to accept
     remover.acceptElement("a", new String[] { "href" });
+    remover.acceptElement("link", new String[] { "href" });
+    remover.acceptElement("img", new String[] { "src" });
+    remover.acceptElement("script", new String[] { "src" });
     // completely remove some elements
     remover.removeElement("script");
     remover.removeElement("head");
@@ -124,13 +129,14 @@
   private void extractLinks(Node node, ArrayList<Outlink> links,
       HashSet<String> set) throws MalformedURLException {
     if (node.getNodeType() == Node.ELEMENT_NODE) {
-      if ("a".equalsIgnoreCase(node.getNodeName())) {
+      if ("a".equalsIgnoreCase(node.getNodeName())||"img".equalsIgnoreCase(node.getNodeName())
+          ||"link".equalsIgnoreCase(node.getNodeName())||"script".equalsIgnoreCase(node.getNodeName()) ) {
         NamedNodeMap attrs = node.getAttributes();
         String target = null;
         for (int i = 0; i < attrs.getLength(); i++) {
           Node attr = attrs.item(i);
           String attrName = attr.getNodeName();
-          if (attrName.equalsIgnoreCase("href")) {
+          if (attrName.equalsIgnoreCase("href")||attrName.equalsIgnoreCase("src")) {
             target = attr.getNodeValue();
             try {
               String newUrl = "";
@@ -139,17 +145,19 @@
                 if(base.getPort()>-1){
                   newUrl+=":"+base.getPort();
                 }
-              }else{
+              }else if(!target.toLowerCase().startsWith("javascript")){
                 newUrl=new URL(base, target).toString();
               }
-              final Outlink outlink = new Outlink(
-                  target.contains(":/") ? target : newUrl, link.getDepth() + 1);
-              log.debug("set size: "+set.size());
-              log.debug("outlink.getToUrl(): "+outlink.getToUrl());
-              log.debug("set.contains(outlink.getToUrl(): "+set.contains(outlink.getToUrl()));
-              if (!set.contains(outlink.getToUrl())) {
-                set.add(outlink.getToUrl());
-                links.add(outlink);
+              if (!newUrl.equals("")) {
+                final Outlink outlink = new Outlink(
+                    target.contains(":/") ? target : newUrl, link.getDepth() + 1);
+                log.debug("set size: "+set.size());
+                log.debug("outlink.getToUrl(): "+outlink.getToUrl());
+                log.debug("set.contains(outlink.getToUrl(): "+set.contains(outlink.getToUrl()));
+                if (!set.contains(outlink.getToUrl())) {
+                  set.add(outlink.getToUrl());
+                  links.add(outlink);
+                }
               }
             } catch (Exception e) {
               log.fatal(e);
@@ -158,7 +166,7 @@
         }
       }
     }
-    NodeList children = node.getChildNodes();
+    final NodeList children = node.getChildNodes();
     if (children != null) {
       int len = children.getLength();
       for (int i = 0; i < len; i++) {



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org