You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by th...@apache.org on 2013/01/30 15:30:36 UTC

svn commit: r1440466 - in /incubator/droids/branches/0.2.x-cleanup: droids-core/src/main/java/org/apache/droids/core/ droids-crawler/src/main/java/org/apache/droids/crawler/ droids-crawler/src/main/java/org/apache/droids/fetcher/ droids-crawler/src/tes...

Author: thorsten
Date: Wed Jan 30 15:30:35 2013
New Revision: 1440466

URL: http://svn.apache.org/viewvc?rev=1440466&view=rev
Log:
Bring back link interface and refactoring code

Added:
    incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java
    incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java
      - copied, changed from r1440461, incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java
Removed:
    incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java
Modified:
    incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java
    incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java
    incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java
    incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java
    incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java

Added: incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java?rev=1440466&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java Wed Jan 30 15:30:35 2013
@@ -0,0 +1,53 @@
+package org.apache.droids.core;
+/*
+
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.net.URI;
+import java.util.Collection;
+
+import org.apache.droids.core.Task;
+
+
+/**
+ * Simple extension of a {@link Task} that adds the originating (from) task,
+ * the outgoing (to) links and the anchor text to the task object.
+ *
+ * @version 1.0
+ */
+public interface LinkedTask extends Task {
+    /**
+     * @return the Anchor text for this link
+     */
+    String getAnchorText();
+
+    /**
+     * The task from which this link was created.
+     *
+     * @return the parent task this link came from
+     */
+    LinkedTask getFrom();
+
+    /**
+     * The locations this task links to.
+     *
+     * @return the URIs this task points to
+     */
+    Collection<URI> getTo();
+
+}

Copied: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java (from r1440461, incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java)
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java&r1=1440461&r2=1440466&rev=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java Wed Jan 30 15:30:35 2013
@@ -17,9 +17,11 @@
 package org.apache.droids.crawler;
 
 import org.apache.droids.core.ContentEntity;
+import org.apache.droids.core.LinkedTask;
 import org.apache.droids.core.Task;
 
 import java.net.URI;
+import java.util.Collection;
 import java.util.Date;
 import java.io.Serializable;
 
@@ -28,14 +30,17 @@ import java.io.Serializable;
  * Basic implementation for Crawler @Task. LinkTasks are working instructions for URI
  * based droids.
  */
-public class LinkTask implements Task, Serializable {
+public class CrawlerTask implements LinkedTask, Serializable {
     private static final long serialVersionUID = -44808094386453088L;
 
     private ContentEntity contentEntity;
     private Date started;
     private final int depth;
     private final URI uri;
-
+    private final LinkedTask from;
+    private Collection<URI> linksTo;
+    private String anchorText;
+    private int weight;
     private boolean aborted = false;
 
     /**
@@ -43,8 +48,8 @@ public class LinkTask implements Task, S
      *
      * @param uri The URI of the task.
      */
-    public LinkTask(URI uri) {
-        this(uri, 0);
+    public CrawlerTask(URI uri) {
+        this(null, uri, 0);
     }
 
     /**
@@ -53,10 +58,42 @@ public class LinkTask implements Task, S
      * @param uri The URI of the task.
      * @param depth The depth of the task.
      */
-    public LinkTask(URI uri, int depth) {
+    public CrawlerTask(LinkedTask from, URI uri, int depth) {
+        this.from = from;
+        this.uri = uri;
+        this.depth = depth;
+        this.started = new Date();
+        this.contentEntity = new ContentEntity();
+    }
+
+    /**
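+     * Creates a new CrawlerTask with an explicit weight.
+     *
+     * @param from the parent task this link came from
+     * @param uri The URI of the task.
+     * @param depth The depth of the task.
+     * @param weight the weight assigned to the task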
+     */
+    public CrawlerTask(LinkedTask from, URI uri, int depth, int weight) {
+        this.from = from;
         this.uri = uri;
         this.depth = depth;
         this.started = new Date();
+        this.weight = weight;
+        this.contentEntity = new ContentEntity();
+    }
+
+    /**
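+     * Creates a new CrawlerTask with an anchor text.
+     *
+     * @param from the parent task this link came from
+     * @param uri The URI of the task.
+     * @param depth The depth of the task.
+     * @param anchorText the anchor text of the link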
+     */
+    public CrawlerTask(LinkedTask from, URI uri, int depth, String anchorText) {
+        this(from, uri, depth);
+        this.anchorText = anchorText;
         this.contentEntity = new ContentEntity();
     }
 
@@ -76,11 +113,6 @@ public class LinkTask implements Task, S
     }
 
     @Override
-    public int getDepth() {
-        return depth;
-    }
-
-    @Override
     public void abort() {
         aborted = true;
     }
@@ -92,11 +124,49 @@ public class LinkTask implements Task, S
 
     @Override
     public Task createTask(URI uri) {
-        return new LinkTask(uri, this.getDepth());
+        return new CrawlerTask(this, uri, this.getDepth() + 1);
     }
 
     @Override
     public String toString() {
         return "(" + getURI().toString() + "," + getDepth() + ")";
     }
+    @Override
+    public int getDepth() {
+        return depth;
+    }
+
+    @Override
+    public LinkedTask getFrom() {
+        return from;
+    }
+
+    @Override
+    public Collection<URI> getTo() {
+        return linksTo;
+    }
+
+    /**
+     * Sets the outgoing links.
+     *
+     * @param linksTo the URIs this task links to
+     */
+    public void setLinksTo(Collection<URI> linksTo) {
+        this.linksTo = linksTo;
+    }
+
+    @Override
+    public String getAnchorText() {
+        return anchorText;
+    }
+
+    /**
+     * Sets the anchor text for this link.
+     *
+     * @param anchorText the anchor text of the link
+     */
+    public void setAnchorText(String anchorText) {
+        this.anchorText = anchorText;
+    }
+
 }
\ No newline at end of file

Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java Wed Jan 30 15:30:35 2013
@@ -21,13 +21,14 @@ import java.util.Collection;
 import java.util.Queue;
 
 import org.apache.droids.core.AbstractDroid;
+import org.apache.droids.core.LinkedTask;
 import org.apache.droids.core.TaskMaster;
 import org.apache.droids.core.Worker;
 import org.apache.droids.fetcher.CrawlingFetcher;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public abstract class CrawlingDroid extends AbstractDroid<LinkTask> {
+public abstract class CrawlingDroid extends AbstractDroid<LinkedTask> {
     protected final Logger logger = LoggerFactory.getLogger(CrawlingDroid.class);
     protected Collection<String> initialLocations;
 
@@ -42,7 +43,7 @@ public abstract class CrawlingDroid exte
      * @param queue Queue<Link>
      * @param taskMaster TaskMaster<Link>
      */
-    public CrawlingDroid(Queue<LinkTask> queue, TaskMaster<LinkTask> taskMaster) {
+    public CrawlingDroid(Queue<LinkedTask> queue, TaskMaster<LinkedTask> taskMaster) {
         super(queue, taskMaster);
         this.setFetcher(new CrawlingFetcher());
     }
@@ -55,6 +56,6 @@ public abstract class CrawlingDroid exte
      * @see org.apache.droids.core.Droid#getNewWorker()
      * @return Worker<Link>
      */
-    public abstract Worker<LinkTask> getNewWorker();
+    public abstract Worker<LinkedTask> getNewWorker();
 
 }

Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java Wed Jan 30 15:30:35 2013
@@ -17,6 +17,7 @@
 package org.apache.droids.crawler;
 
 import org.apache.droids.core.DroidsException;
+import org.apache.droids.core.LinkedTask;
 import org.apache.droids.core.Worker;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -24,7 +25,7 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.util.Set;
 
-public class CrawlingWorker implements Worker<LinkTask> {
+public class CrawlingWorker implements Worker<LinkedTask> {
 
     private static final Logger logger = LoggerFactory.getLogger(CrawlingWorker.class);
 
@@ -35,7 +36,7 @@ public class CrawlingWorker implements W
     }
 
     @Override
-    public void execute(LinkTask task) throws DroidsException, IOException {
+    public void execute(LinkedTask task) throws DroidsException, IOException {
         if (logger.isInfoEnabled()) {
             logger.info("Loading " + task.getURI());
         }
@@ -43,9 +44,9 @@ public class CrawlingWorker implements W
         droid.parse(task);
 
         // add this to a link handler
-        Set<LinkTask> links = task.getContentEntity().getLinks();
+        Set<LinkedTask> links = task.getContentEntity().getLinks();
         if (links != null) {
-            for (LinkTask outLink : links) {
+            for (LinkedTask outLink : links) {
                 if (droid.filter(outLink) != null) {
                     droid.add(outLink);
                 }

Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java Wed Jan 30 15:30:35 2013
@@ -23,6 +23,8 @@ import java.net.URISyntaxException;
 import java.util.Queue;
 
 import com.google.common.base.Preconditions;
+
+import org.apache.droids.core.LinkedTask;
 import org.apache.droids.core.TaskMaster;
 import org.apache.droids.core.Worker;
 
@@ -31,7 +33,7 @@ import org.apache.droids.core.Worker;
  */
 public class SimpleCrawlingDroid extends CrawlingDroid {
 
-    public SimpleCrawlingDroid(Queue<LinkTask> queue, TaskMaster<LinkTask> taskMaster) {
+    public SimpleCrawlingDroid(Queue<LinkedTask> queue, TaskMaster<LinkedTask> taskMaster) {
         super(queue, taskMaster);
     }
 
@@ -42,7 +44,7 @@ public class SimpleCrawlingDroid extends
         for (String location : initialLocations) {
             try {
                 URI uri = new URI(location);
-                queue.offer(new LinkTask(uri));
+                queue.offer(new CrawlerTask(uri));
             } catch (URISyntaxException ex) {
                 logger.error(ex.getMessage());
             }
@@ -51,7 +53,7 @@ public class SimpleCrawlingDroid extends
     }
 
     @Override
-    public Worker<LinkTask> getNewWorker() {
+    public Worker<LinkedTask> getNewWorker() {
         return new CrawlingWorker(this);
     }
 }

Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java Wed Jan 30 15:30:35 2013
@@ -1,7 +1,7 @@
 package org.apache.droids.fetcher;
 
 import org.apache.droids.core.Fetcher;
-import org.apache.droids.crawler.LinkTask;
+import org.apache.droids.core.LinkedTask;
 import org.apache.droids.norobots.ContentLoader;
 import org.apache.droids.norobots.HttpClientContentLoader;
 import org.apache.droids.norobots.NoRobotClient;
@@ -26,7 +26,7 @@ import java.net.URISyntaxException;
  *
  *
  */
-public class CrawlingFetcher implements Fetcher<LinkTask> {
+public class CrawlingFetcher implements Fetcher<LinkedTask> {
     private boolean forceAllow;
     private String userAgent;
     private final HttpClient httpClient;
@@ -46,7 +46,7 @@ public class CrawlingFetcher implements 
     }
 
     @Override
-    public boolean isAllowed(LinkTask task) throws IOException {
+    public boolean isAllowed(LinkedTask task) throws IOException {
         if (forceAllow) {
             return forceAllow;
         }
@@ -77,7 +77,7 @@ public class CrawlingFetcher implements 
     }
 
     @Override
-    public void fetch(LinkTask task) throws IOException {
+    public void fetch(LinkedTask task) throws IOException {
         HttpGet httpget = new HttpGet(task.getURI());
         HttpResponse response = httpClient.execute(httpget);
         StatusLine statusline = response.getStatusLine();

Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java Wed Jan 30 15:30:35 2013
@@ -1,14 +1,11 @@
 package org.apache.droids.crawler;
 
-import junit.framework.Assert;
-import org.apache.droids.core.Droid;
+import org.apache.droids.core.LinkedTask;
 import org.apache.droids.core.SimpleTaskQueueWithHistory;
 import org.apache.droids.core.TaskMaster;
 import org.apache.droids.crawler.localserver.LocalHttpServer;
 import org.apache.droids.crawler.localserver.ResourceHandler;
-import org.apache.droids.filter.HostFilter;
 import org.apache.droids.handle.ReportHandler;
-import org.apache.droids.handle.SysoutHandler;
 import org.apache.droids.parse.SimpleLinkParser;
 import org.apache.droids.taskmaster.SequentialTaskMaster;
 import static org.junit.Assert.*;
@@ -47,8 +44,8 @@ public class CrawlingDroidTest {
         String targetURI = baseURI + "/start_html";
 
 
-        Queue<LinkTask> queue = new SimpleTaskQueueWithHistory<LinkTask>();
-        TaskMaster<LinkTask> taskMaster = new SequentialTaskMaster<LinkTask>();
+        Queue<LinkedTask> queue = new SimpleTaskQueueWithHistory<LinkedTask>();
+        TaskMaster<LinkedTask> taskMaster = new SequentialTaskMaster<LinkedTask>();
 
         Collection<String> initialLocations = new LinkedList<String>();
         initialLocations.add(targetURI);
@@ -56,7 +53,7 @@ public class CrawlingDroidTest {
 
         SimpleCrawlingDroid droid = new SimpleCrawlingDroid(queue, taskMaster);
         droid.setInitialLocations(initialLocations);
-        droid.addParsers(new SimpleLinkParser());
+        droid.addParsers(new SimpleLinkParser<LinkedTask>());
 
 
         // just output the filename