You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by th...@apache.org on 2013/01/30 15:30:36 UTC
svn commit: r1440466 - in /incubator/droids/branches/0.2.x-cleanup:
droids-core/src/main/java/org/apache/droids/core/
droids-crawler/src/main/java/org/apache/droids/crawler/
droids-crawler/src/main/java/org/apache/droids/fetcher/
droids-crawler/src/tes...
Author: thorsten
Date: Wed Jan 30 15:30:35 2013
New Revision: 1440466
URL: http://svn.apache.org/viewvc?rev=1440466&view=rev
Log:
Bring back link interface and refactoring code
Added:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java
- copied, changed from r1440461, incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java
Removed:
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java
Modified:
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java
Added: incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java?rev=1440466&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/LinkedTask.java Wed Jan 30 15:30:35 2013
@@ -0,0 +1,53 @@
+package org.apache.droids.core;
+/*
+
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.net.URI;
+import java.util.Collection;
+
+import org.apache.droids.core.Task;
+
+
+/**
+ * Extension of a {@link Task} that exposes link information: the task the
+ * link was created from, the URIs it points to, and its anchor text.
+ *
+ * @version 1.0
+ */
+public interface LinkedTask extends Task {
+ /**
+ * @return the anchor text for this link
+ */
+ String getAnchorText();
+
+ /**
+ * The task from which this link was created.
+ *
+ * @return the parent task this link came from; implementations may return {@code null}
+ */
+ LinkedTask getFrom();
+
+ /**
+ * The locations this link points to.
+ *
+ * @return the URIs this link points to
+ */
+ Collection<URI> getTo();
+
+}
Copied: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java (from r1440461, incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java)
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java&r1=1440461&r2=1440466&rev=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerTask.java Wed Jan 30 15:30:35 2013
@@ -17,9 +17,11 @@
package org.apache.droids.crawler;
import org.apache.droids.core.ContentEntity;
+import org.apache.droids.core.LinkedTask;
import org.apache.droids.core.Task;
import java.net.URI;
+import java.util.Collection;
import java.util.Date;
import java.io.Serializable;
@@ -28,14 +30,17 @@ import java.io.Serializable;
* Basic implementation for Crawler @Task. LinkTasks are working instructions for URI
* based droids.
*/
-public class LinkTask implements Task, Serializable {
+public class CrawlerTask implements LinkedTask, Serializable {
private static final long serialVersionUID = -44808094386453088L;
private ContentEntity contentEntity;
private Date started;
private final int depth;
private final URI uri;
-
+ private final LinkedTask from;
+ private Collection<URI> linksTo;
+ private String anchorText;
+ private int weight;
private boolean aborted = false;
/**
@@ -43,8 +48,8 @@ public class LinkTask implements Task, S
*
* @param uri The URI of the task.
*/
- public LinkTask(URI uri) {
- this(uri, 0);
+ public CrawlerTask(URI uri) {
+ this(null, uri, 0);
}
/**
@@ -53,10 +58,42 @@ public class LinkTask implements Task, S
* @param uri The URI of the task.
* @param depth The depth of the task.
*/
- public LinkTask(URI uri, int depth) {
+ public CrawlerTask(LinkedTask from, URI uri, int depth) {
+ this.from = from;
+ this.uri = uri;
+ this.depth = depth;
+ this.started = new Date();
+ this.contentEntity = new ContentEntity();
+ }
+
+ /**
+ * Creates a new CrawlerTask.
+ *
+ * @param from Link
+ * @param uri URI
+ * @param depth int
+ * @param weight int
+ */
+ public CrawlerTask(LinkedTask from, URI uri, int depth, int weight) {
+ this.from = from;
this.uri = uri;
this.depth = depth;
this.started = new Date();
+ this.weight = weight;
+ this.contentEntity = new ContentEntity();
+ }
+
+ /**
+ * Creates a new CrawlerTask.
+ *
+ * @param from Link
+ * @param uri URI
+ * @param depth int
+ * @param anchorText String
+ */
+ public CrawlerTask(LinkedTask from, URI uri, int depth, String anchorText) {
+ this(from, uri, depth);
+ this.anchorText = anchorText;
this.contentEntity = new ContentEntity();
}
@@ -76,11 +113,6 @@ public class LinkTask implements Task, S
}
@Override
- public int getDepth() {
- return depth;
- }
-
- @Override
public void abort() {
aborted = true;
}
@@ -92,11 +124,49 @@ public class LinkTask implements Task, S
@Override
public Task createTask(URI uri) {
- return new LinkTask(uri, this.getDepth());
+ return new CrawlerTask(this, uri, this.getDepth() + 1);
}
@Override
public String toString() {
return "(" + getURI().toString() + "," + getDepth() + ")";
}
+ @Override
+ public int getDepth() {
+ return depth;
+ }
+
+ @Override
+ public LinkedTask getFrom() {
+ return from;
+ }
+
+ @Override
+ public Collection<URI> getTo() {
+ return linksTo;
+ }
+
+ /**
+ * Set Outgoing links.
+ *
+ * @param linksTo the URIs of the outgoing links
+ */
+ public void setLinksTo(Collection<URI> linksTo) {
+ this.linksTo = linksTo;
+ }
+
+ @Override
+ public String getAnchorText() {
+ return anchorText;
+ }
+
+ /**
+ * Set the anchor text for this link.
+ *
+ * @param anchorText String
+ */
+ public void setAnchorText(String anchorText) {
+ this.anchorText = anchorText;
+ }
+
}
\ No newline at end of file
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java Wed Jan 30 15:30:35 2013
@@ -21,13 +21,14 @@ import java.util.Collection;
import java.util.Queue;
import org.apache.droids.core.AbstractDroid;
+import org.apache.droids.core.LinkedTask;
import org.apache.droids.core.TaskMaster;
import org.apache.droids.core.Worker;
import org.apache.droids.fetcher.CrawlingFetcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public abstract class CrawlingDroid extends AbstractDroid<LinkTask> {
+public abstract class CrawlingDroid extends AbstractDroid<LinkedTask> {
protected final Logger logger = LoggerFactory.getLogger(CrawlingDroid.class);
protected Collection<String> initialLocations;
@@ -42,7 +43,7 @@ public abstract class CrawlingDroid exte
* @param queue Queue<Link>
* @param taskMaster TaskMaster<Link>
*/
- public CrawlingDroid(Queue<LinkTask> queue, TaskMaster<LinkTask> taskMaster) {
+ public CrawlingDroid(Queue<LinkedTask> queue, TaskMaster<LinkedTask> taskMaster) {
super(queue, taskMaster);
this.setFetcher(new CrawlingFetcher());
}
@@ -55,6 +56,6 @@ public abstract class CrawlingDroid exte
* @see org.apache.droids.core.Droid#getNewWorker()
* @return Worker<Link>
*/
- public abstract Worker<LinkTask> getNewWorker();
+ public abstract Worker<LinkedTask> getNewWorker();
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java Wed Jan 30 15:30:35 2013
@@ -17,6 +17,7 @@
package org.apache.droids.crawler;
import org.apache.droids.core.DroidsException;
+import org.apache.droids.core.LinkedTask;
import org.apache.droids.core.Worker;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -24,7 +25,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Set;
-public class CrawlingWorker implements Worker<LinkTask> {
+public class CrawlingWorker implements Worker<LinkedTask> {
private static final Logger logger = LoggerFactory.getLogger(CrawlingWorker.class);
@@ -35,7 +36,7 @@ public class CrawlingWorker implements W
}
@Override
- public void execute(LinkTask task) throws DroidsException, IOException {
+ public void execute(LinkedTask task) throws DroidsException, IOException {
if (logger.isInfoEnabled()) {
logger.info("Loading " + task.getURI());
}
@@ -43,9 +44,9 @@ public class CrawlingWorker implements W
droid.parse(task);
// add this to a link handler
- Set<LinkTask> links = task.getContentEntity().getLinks();
+ Set<LinkedTask> links = task.getContentEntity().getLinks();
if (links != null) {
- for (LinkTask outLink : links) {
+ for (LinkedTask outLink : links) {
if (droid.filter(outLink) != null) {
droid.add(outLink);
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java Wed Jan 30 15:30:35 2013
@@ -23,6 +23,8 @@ import java.net.URISyntaxException;
import java.util.Queue;
import com.google.common.base.Preconditions;
+
+import org.apache.droids.core.LinkedTask;
import org.apache.droids.core.TaskMaster;
import org.apache.droids.core.Worker;
@@ -31,7 +33,7 @@ import org.apache.droids.core.Worker;
*/
public class SimpleCrawlingDroid extends CrawlingDroid {
- public SimpleCrawlingDroid(Queue<LinkTask> queue, TaskMaster<LinkTask> taskMaster) {
+ public SimpleCrawlingDroid(Queue<LinkedTask> queue, TaskMaster<LinkedTask> taskMaster) {
super(queue, taskMaster);
}
@@ -42,7 +44,7 @@ public class SimpleCrawlingDroid extends
for (String location : initialLocations) {
try {
URI uri = new URI(location);
- queue.offer(new LinkTask(uri));
+ queue.offer(new CrawlerTask(uri));
} catch (URISyntaxException ex) {
logger.error(ex.getMessage());
}
@@ -51,7 +53,7 @@ public class SimpleCrawlingDroid extends
}
@Override
- public Worker<LinkTask> getNewWorker() {
+ public Worker<LinkedTask> getNewWorker() {
return new CrawlingWorker(this);
}
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java Wed Jan 30 15:30:35 2013
@@ -1,7 +1,7 @@
package org.apache.droids.fetcher;
import org.apache.droids.core.Fetcher;
-import org.apache.droids.crawler.LinkTask;
+import org.apache.droids.core.LinkedTask;
import org.apache.droids.norobots.ContentLoader;
import org.apache.droids.norobots.HttpClientContentLoader;
import org.apache.droids.norobots.NoRobotClient;
@@ -26,7 +26,7 @@ import java.net.URISyntaxException;
*
*
*/
-public class CrawlingFetcher implements Fetcher<LinkTask> {
+public class CrawlingFetcher implements Fetcher<LinkedTask> {
private boolean forceAllow;
private String userAgent;
private final HttpClient httpClient;
@@ -46,7 +46,7 @@ public class CrawlingFetcher implements
}
@Override
- public boolean isAllowed(LinkTask task) throws IOException {
+ public boolean isAllowed(LinkedTask task) throws IOException {
if (forceAllow) {
return forceAllow;
}
@@ -77,7 +77,7 @@ public class CrawlingFetcher implements
}
@Override
- public void fetch(LinkTask task) throws IOException {
+ public void fetch(LinkedTask task) throws IOException {
HttpGet httpget = new HttpGet(task.getURI());
HttpResponse response = httpClient.execute(httpget);
StatusLine statusline = response.getStatusLine();
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java?rev=1440466&r1=1440465&r2=1440466&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java Wed Jan 30 15:30:35 2013
@@ -1,14 +1,11 @@
package org.apache.droids.crawler;
-import junit.framework.Assert;
-import org.apache.droids.core.Droid;
+import org.apache.droids.core.LinkedTask;
import org.apache.droids.core.SimpleTaskQueueWithHistory;
import org.apache.droids.core.TaskMaster;
import org.apache.droids.crawler.localserver.LocalHttpServer;
import org.apache.droids.crawler.localserver.ResourceHandler;
-import org.apache.droids.filter.HostFilter;
import org.apache.droids.handle.ReportHandler;
-import org.apache.droids.handle.SysoutHandler;
import org.apache.droids.parse.SimpleLinkParser;
import org.apache.droids.taskmaster.SequentialTaskMaster;
import static org.junit.Assert.*;
@@ -47,8 +44,8 @@ public class CrawlingDroidTest {
String targetURI = baseURI + "/start_html";
- Queue<LinkTask> queue = new SimpleTaskQueueWithHistory<LinkTask>();
- TaskMaster<LinkTask> taskMaster = new SequentialTaskMaster<LinkTask>();
+ Queue<LinkedTask> queue = new SimpleTaskQueueWithHistory<LinkedTask>();
+ TaskMaster<LinkedTask> taskMaster = new SequentialTaskMaster<LinkedTask>();
Collection<String> initialLocations = new LinkedList<String>();
initialLocations.add(targetURI);
@@ -56,7 +53,7 @@ public class CrawlingDroidTest {
SimpleCrawlingDroid droid = new SimpleCrawlingDroid(queue, taskMaster);
droid.setInitialLocations(initialLocations);
- droid.addParsers(new SimpleLinkParser());
+ droid.addParsers(new SimpleLinkParser<LinkedTask>());
// just output the filename