You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by to...@apache.org on 2013/01/29 17:27:44 UTC
svn commit: r1440010 - in
/incubator/droids/branches/0.2.x-cleanup/droids-crawler: ./
src/main/java/org/apache/droids/crawler/
src/main/java/org/apache/droids/fetcher/
src/main/java/org/apache/droids/norobots/
src/main/java/org/apache/droids/protocol/h...
Author: tobr
Date: Tue Jan 29 17:27:43 2013
New Revision: 1440010
URL: http://svn.apache.org/viewvc?rev=1440010&view=rev
Log:
added simple crawler module
simplified the usage of HttpClient
added back the test http server from examples
Added:
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/DroidsHttpClient.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/HttpClientContentLoader.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/LocalHttpServer.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/ResourceHandler.java (with props)
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page1_html
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page2_html
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page3_html
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page4_html
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/start_html
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/logback.xml (with props)
Removed:
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/Link.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/ReportCrawlingDroid.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/protocol/http/
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/AppTest.java
Modified:
incubator/droids/branches/0.2.x-cleanup/droids-crawler/pom.xml
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java
incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/pom.xml?rev=1440010&r1=1440009&r2=1440010&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/pom.xml (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/pom.xml Tue Jan 29 17:27:43 2013
@@ -9,7 +9,7 @@
<version>0.3.0-incubating-SNAPSHOT</version>
</parent>
<artifactId>droids-crawler</artifactId>
- <name>droids-crawler</name>
+ <name>APACHE DROIDS CRAWLER</name>
<dependencies>
<dependency>
<groupId>org.apache.droids</groupId>
@@ -17,10 +17,26 @@
<version>${project.version}</version>
</dependency>
<dependency>
+ <groupId>org.apache.droids</groupId>
+ <artifactId>droids-norobots</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ <version>4.2.2</version>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+ <version>${logback.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</project>
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java?rev=1440010&r1=1440009&r2=1440010&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingDroid.java Tue Jan 29 17:27:43 2013
@@ -16,57 +16,36 @@
*/
package org.apache.droids.crawler;
-import java.net.URI;
-import java.net.URISyntaxException;
import java.util.Collection;
-import com.google.common.base.Preconditions;
-
import java.util.Queue;
import org.apache.droids.core.AbstractDroid;
import org.apache.droids.core.TaskMaster;
import org.apache.droids.core.Worker;
-import org.apache.droids.exception.InvalidTaskException;
-
-public abstract class CrawlingDroid extends AbstractDroid<Link> {
+import org.apache.droids.fetcher.CrawlingFetcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class CrawlingDroid extends AbstractDroid<LinkTask> {
+ protected Collection<String> initialLocations;
+ protected final Logger logger = LoggerFactory.getLogger(CrawlingDroid.class);
- private Collection<String> initialLocations;
+ public CrawlingDroid() {
+ this(null, null);
+ }
- public CrawlingDroid(Queue<Link> queue, TaskMaster<Link> taskMaster) {
+ public CrawlingDroid(Queue<LinkTask> queue, TaskMaster<LinkTask> taskMaster) {
super(queue, taskMaster);
+ this.setFetcher(new CrawlingFetcher());
}
public void setInitialLocations(Collection<String> initialLocations) {
this.initialLocations = initialLocations;
}
- @Override
- public void init() throws InvalidTaskException {
- Preconditions.checkState(initialLocations != null,
- "WebCrawlerDroid requires at least one starting file");
- Preconditions.checkState(!initialLocations.isEmpty(),
- "WebCrawlerDroid requires at least one starting file");
- for (String location : initialLocations) {
- URI uri;
- try {
- uri = new URI(location);
- } catch (URISyntaxException ex) {
- throw new InvalidTaskException("Invalid lication: " + location);
- }
- queue.offer(new LinkTask(null, uri, 0));
- }
- }
-
- public void start() {
- taskMaster.start(queue, this);
- }
-
- @Override
- public void finished() {
- logger.info("FINISHED!!!");
+ public Worker<LinkTask> getNewWorker() {
+ return new CrawlingWorker(this);
}
- public abstract Worker<Link> getNewWorker();
-
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java?rev=1440010&r1=1440009&r2=1440010&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlingWorker.java Tue Jan 29 17:27:43 2013
@@ -16,123 +16,43 @@
*/
package org.apache.droids.crawler;
-import java.io.IOException;
-import java.net.URI;
-import java.util.Collection;
-import java.util.LinkedHashMap;
-import java.util.Map;
-
import org.apache.droids.core.DroidsException;
-import org.apache.droids.core.Protocol;
-import org.apache.droids.core.Task;
import org.apache.droids.core.Worker;
-import org.apache.droids.helper.factories.HandlerFactory;
-import org.apache.droids.helper.factories.URLFiltersFactory;
-import org.apache.droids.parse.Parse;
-import org.apache.droids.parse.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public class CrawlingWorker implements Worker<Link> {
+import java.io.IOException;
+import java.util.Set;
+
+public class CrawlingWorker implements Worker<LinkTask> {
- private static final Logger LOG = LoggerFactory.getLogger(CrawlingWorker.class);
+ private static final Logger logger = LoggerFactory.getLogger(CrawlingWorker.class);
private final CrawlingDroid droid;
- HandlerFactory handlerFactory;
public CrawlingWorker(CrawlingDroid droid) {
this.droid = droid;
}
@Override
- public void execute(Link link) throws DroidsException, IOException {
- final String userAgent = this.getClass().getCanonicalName();
- if (LOG.isDebugEnabled()) {
- LOG.debug("Starting " + userAgent);
- }
- URI uri = link.getURI();
- final Protocol protocol = droid.getProtocolFactory().getProtocol(uri);
- if (protocol == null) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Unsupported protocol scheme '" + uri.getScheme() + "'");
- }
- return;
+ public void execute(LinkTask task) throws DroidsException, IOException {
+ if (logger.isInfoEnabled()) {
+ logger.info("Loading " + task.getURI());
}
+ droid.load(task);
+ droid.parse(task);
- if (protocol.isAllowed(uri)) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Loading " + uri);
- }
-// ContentEntity entity = null;
- try {
-// entity = protocol.load(uri);
- } catch (OutOfMemoryError e) {
- LOG.error("Out of memory processing: " + uri + " skipping", e);
- throw new DroidsException(e);
- }
- try {
-// String contentType = entity.getMimeType();
- String contentType = "";
- if (LOG.isDebugEnabled()) {
- LOG.debug("Content type " + contentType);
- }
- if (contentType == null) {
- LOG.info("Missing content type... can't parse...");
- } else {
- Parser parser = droid.getParserFactory().getParser(contentType);
- if (parser == null) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Could not find parser for " + contentType);
- }
- } else {
-// Parse parse = parser.parse(entity, link);
- Parse parse = null;
- if (parse.getNewTasks() != null && parse.isFollowed()) {
- Collection<Link> outlinks = getFilteredOutlinks(parse);
- droid.getQueue().addAll(outlinks);
- }
-// entity.setParse(parse);
- handle(link);
- }
+ // add this to a link handler
+ Set<LinkTask> links = task.getContentEntity().getLinks();
+ if (links != null) {
+ for (LinkTask outLink : links) {
+ if (droid.filter(outLink) != null) {
+ droid.add(outLink);
}
- } finally {
-// entity.finish();
- }
- } else {
- if (LOG.isInfoEnabled()) {
- LOG.info("Stopping processing since" + " bots are not allowed for " + uri);
}
}
+ droid.handle(task);
+ droid.finish(task);
}
- protected void handle(Task task) throws DroidsException, IOException {
- getHandlerFactory().handle(task);
- }
-
- protected Collection<Link> getFilteredOutlinks(Parse parse) {
- URLFiltersFactory filters = droid.getFiltersFactory();
-
- // TODO -- make the hashvalue for Outlink...
- Map<URI, Link> filtered = new LinkedHashMap<URI, Link>();
- for (Task outTask : parse.getNewTasks()) {
- // only use Links, so if for some reason it isn't a Link, skip
- if (!(outTask instanceof Link)) {
- continue;
- }
- Link outlink = (Link) outTask;
- URI uri = outlink.getURI();
- if (filters.accept(outlink) && !filtered.containsKey(uri)) {
- filtered.put(uri, outlink);
- }
- }
- return filtered.values();
- }
-
- public HandlerFactory getHandlerFactory() {
- return handlerFactory;
- }
-
- public void setHandlerFactory(HandlerFactory handlerFactory) {
- this.handlerFactory = handlerFactory;
- }
}
Modified: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java?rev=1440010&r1=1440009&r2=1440010&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/LinkTask.java Tue Jan 29 17:27:43 2013
@@ -16,73 +16,48 @@
*/
package org.apache.droids.crawler;
+import org.apache.droids.core.ContentEntity;
+import org.apache.droids.core.Task;
+
import java.net.URI;
-import java.util.Collection;
import java.util.Date;
-import java.io.IOException;
-import java.io.InputStream;
import java.io.Serializable;
/**
- * Basic implementation for @Link. LinkTasks are working instructions for URI
+ * Basic implementation for Crawler @Task. LinkTasks are working instructions for URI
* based droids.
*/
-public class LinkTask implements Link, Serializable {
+public class LinkTask implements Task, Serializable {
private static final long serialVersionUID = -44808094386453088L;
+ private ContentEntity contentEntity;
private Date started;
private final int depth;
private final URI uri;
- private final Link from;
- private Date lastModifiedDate;
- private Collection<URI> linksTo;
- private String anchorText;
- private int weight;
private boolean aborted = false;
/**
* Creates a new LinkTask.
*
- * @param from
- * @param uri
- * @param depth
+ * @param uri The URI of the task.
*/
- public LinkTask(Link from, URI uri, int depth) {
- this.from = from;
- this.uri = uri;
- this.depth = depth;
- this.started = new Date();
+ public LinkTask(URI uri) {
+ this(uri, 0);
}
/**
* Creates a new LinkTask.
*
- * @param from
- * @param uri
- * @param depth
- * @param weight
+ * @param uri The URI of the task.
+ * @param depth The depth of the task.
*/
- public LinkTask(Link from, URI uri, int depth, int weight) {
- this.from = from;
+ public LinkTask(URI uri, int depth) {
this.uri = uri;
this.depth = depth;
this.started = new Date();
- this.weight = weight;
- }
-
- /**
- * Creates a new LinkTask
- *
- * @param from
- * @param uri
- * @param depth
- * @param anchorText
- */
- public LinkTask(Link from, URI uri, int depth, String anchorText) {
- this(from, uri, depth);
- this.anchorText = anchorText;
+ this.contentEntity = new ContentEntity();
}
@Override
@@ -91,9 +66,8 @@ public class LinkTask implements Link, S
}
@Override
- public InputStream getContent() throws IOException {
- // TODO Auto-generated method stub
- return null;
+ public ContentEntity getContentEntity() {
+ return this.contentEntity;
}
@Override
@@ -101,93 +75,28 @@ public class LinkTask implements Link, S
return started;
}
- /**
- * Set the Date the task started.
- *
- * @param started
- */
- public void setTaskDate(Date started) {
- this.started = started;
- }
-
@Override
public int getDepth() {
return depth;
}
@Override
- public Link getFrom() {
- return from;
- }
-
- @Override
- public Collection<URI> getTo() {
- return linksTo;
- }
-
- @Override
- public Date getLastModifiedDate() {
- return lastModifiedDate;
- }
-
- /**
- * Set the Date the Task object was last modified.
- *
- * @param lastModifiedDate
- */
- public void setLastModifiedDate(Date lastModifiedDate) {
- this.lastModifiedDate = lastModifiedDate;
- }
-
- /**
- * Set Outgoing links.
- *
- * @param linksTo
- */
- public void setLinksTo(Collection<URI> linksTo) {
- this.linksTo = linksTo;
+ public void abort() {
+ aborted = true;
}
@Override
- public String getAnchorText() {
- return anchorText;
- }
-
- /**
- * Set the anchor text for this link.
- *
- * @param anchorText
- */
- public void setAnchorText(String anchorText) {
- this.anchorText = anchorText;
- }
-
- /**
- * Get the weight of the link
- *
- * @return the links weight
- */
- public int getWeight() {
- return weight;
- }
-
- /**
- * Set the weight of the link.
- *
- * @param weight
- */
- public void setWeight(int weight) {
- this.weight = weight;
+ public boolean isAborted() {
+ return aborted;
}
@Override
- public void abort() {
- aborted = true;
+ public Task createTask(URI uri) {
+ return new LinkTask(uri, this.getDepth());
}
@Override
- public boolean isAborted() {
- return aborted;
+ public String toString() {
+ return "(" + getURI().toString() + "," + getDepth() + ")";
}
-
}
\ No newline at end of file
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java Tue Jan 29 17:27:43 2013
@@ -0,0 +1,54 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+package org.apache.droids.crawler;
+
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Queue;
+
+import com.google.common.base.Preconditions;
+import org.apache.droids.core.TaskMaster;
+import org.apache.droids.handle.ReportHandler;
+
+
+/**
+ * CrawlingDroid that seeds its queue from the configured initial locations when started.
+ * It registers no handler of its own; callers add one, for example a ReportHandler.
+ */
+public class SimpleCrawlingDroid extends CrawlingDroid {
+  /** Creates a droid that works on {@code queue} and is driven by {@code taskMaster}. */
+  public SimpleCrawlingDroid(Queue<LinkTask> queue, TaskMaster<LinkTask> taskMaster) {
+    super(queue, taskMaster);
+  }
+  /** Queues a LinkTask per well-formed initial location, then delegates to super.start(). */
+  @Override
+  public void start() {
+    // Fails with IllegalStateException unless a location is set; '&&' keeps isEmpty() off null.
+    Preconditions.checkState(initialLocations != null && !initialLocations.isEmpty(),
+        "CrawlingDroid requires at least one starting file");
+    for (String location : initialLocations) {
+      try {
+        queue.offer(new LinkTask(new URI(location)));
+      } catch (URISyntaxException ex) {
+        logger.error(ex.getMessage(), ex); // skip the bad location but keep the stack trace
+      }
+    }
+    super.start();
+  }
+}
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/crawler/SimpleCrawlingDroid.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java Tue Jan 29 17:27:43 2013
@@ -0,0 +1,134 @@
+package org.apache.droids.fetcher;
+
+import org.apache.droids.core.Fetcher;
+import org.apache.droids.crawler.LinkTask;
+import org.apache.droids.norobots.ContentLoader;
+import org.apache.droids.norobots.HttpClientContentLoader;
+import org.apache.droids.norobots.NoRobotClient;
+import org.apache.droids.norobots.NoRobotException;
+import org.apache.http.*;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.HttpResponseException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.SystemDefaultHttpClient;
+import org.apache.http.params.CoreConnectionPNames;
+import org.apache.http.params.CoreProtocolPNames;
+import org.apache.http.params.HttpParams;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+/**
+ * Fetcher for {@link LinkTask}s: checks robots.txt through a NoRobotClient and downloads
+ * the task URI with an HttpClient, storing content and headers in the task's ContentEntity.
+ */
+public class CrawlingFetcher implements Fetcher<LinkTask> {
+  private final static String DROIDS_USER_AGENT = "Apache-Droids/0.3 (java 1.5)";
+  private final static Logger logger = LoggerFactory.getLogger(CrawlingFetcher.class);
+  private final HttpClient httpClient;
+  private final ContentLoader contentLoader;
+  private boolean forceAllow;
+  private String userAgent;
+
+  /** Creates a fetcher that uses the default Droids user agent. */
+  public CrawlingFetcher() {
+    this(DROIDS_USER_AGENT);
+  }
+
+  /** Creates a fetcher with its own SystemDefaultHttpClient and the given user agent. */
+  public CrawlingFetcher(String userAgent) {
+    this.httpClient = new SystemDefaultHttpClient();
+    this.contentLoader = new HttpClientContentLoader(httpClient);
+    this.userAgent = userAgent;
+  }
+
+  /**
+   * Tells whether the task URI may be fetched according to the site's robots.txt.
+   * @param task the task whose URI is checked
+   * @return true if forceAllow is set or the URI is allowed; false if it is denied, the
+   *         base URI cannot be built, or robots.txt cannot be parsed
+   * @throws IOException if an underlying call fails with an I/O error
+   */
+  @Override
+  public boolean isAllowed(LinkTask task) throws IOException {
+    if (forceAllow) {
+      return true;
+    }
+    URI uri = task.getURI();
+    URI baseURI;
+    try {
+      // Base URI of the site: same scheme, user info, host and port, with path "/".
+      baseURI = new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(),
+          "/", null, null);
+    } catch (URISyntaxException ex) {
+      logger.error("Unable to determine base URI for " + uri, ex);
+      return false;
+    }
+    NoRobotClient nrc = new NoRobotClient(contentLoader, userAgent);
+    try {
+      nrc.parse(baseURI);
+    } catch (NoRobotException ex) {
+      logger.error("Failure parsing robots.txt: " + ex.getMessage(), ex);
+      return false;
+    }
+    boolean allowed = nrc.isUrlAllowed(uri);
+    logger.info("{} is {}", uri, allowed ? "allowed" : "denied");
+    return allowed;
+  }
+
+  /**
+   * Requests the task URI with GET. If the response has an entity, its content stream and
+   * all response headers are stored in the task's ContentEntity. The status code is not
+   * inspected, so error responses are stored like any other.
+   * @param task the task to fetch; its ContentEntity receives the result
+   * @throws IOException if executing the request or opening the content stream fails
+   */
+  @Override
+  public void fetch(LinkTask task) throws IOException {
+    HttpResponse response = httpClient.execute(new HttpGet(task.getURI()));
+    HttpEntity entity = response.getEntity();
+    if (entity != null) {
+      task.getContentEntity().setContent(entity.getContent());
+      for (Header header : response.getAllHeaders()) {
+        task.getContentEntity().put(header.getName(), header.getValue());
+      }
+    }
+  }
+
+  /**
+   * Writes the Droids defaults (user agent, timeouts, header limits) to the client's parameters.
+   * @param params NOTE(review): not used by this method — confirm whether it should be
+   */
+  public void setDefaultHttpParams(HttpParams params) {
+    HttpParams defaults = httpClient.getParams();
+    defaults.setParameter(CoreProtocolPNames.USER_AGENT, userAgent);
+    defaults.setParameter(CoreProtocolPNames.USE_EXPECT_CONTINUE, false);
+    defaults.setParameter(CoreConnectionPNames.STALE_CONNECTION_CHECK, false);
+    defaults.setIntParameter(CoreConnectionPNames.MAX_HEADER_COUNT, 256);
+    defaults.setIntParameter(CoreConnectionPNames.MAX_LINE_LENGTH, 5 * 1024);
+    defaults.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 20000);
+    defaults.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
+  }
+
+  /**
+   * Tells whether robots.txt is ignored. This should only be used on servers that you
+   * control and where you have the permission to ignore robots.txt.
+   * @return <code>true</code> if robots.txt is ignored, <code>false</code> if it is obeyed
+   */
+  public boolean isForceAllow() {
+    return forceAllow;
+  }
+
+  /**
+   * Sets whether robots.txt is ignored. This should only be used on servers that you
+   * control and where you have the permission to ignore robots.txt.
+   * @param forceAllow <code>true</code> to ignore robots.txt, <code>false</code> to obey it
+   */
+  public void setForceAllow(boolean forceAllow) {
+    this.forceAllow = forceAllow;
+  }
+}
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/CrawlingFetcher.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/DroidsHttpClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/DroidsHttpClient.java?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/DroidsHttpClient.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/DroidsHttpClient.java Tue Jan 29 17:27:43 2013
@@ -0,0 +1,27 @@
+package org.apache.droids.fetcher;
+
+import org.apache.http.impl.client.SystemDefaultHttpClient;
+import org.apache.http.params.CoreConnectionPNames;
+import org.apache.http.params.CoreProtocolPNames;
+import org.apache.http.params.HttpParams;
+
+import javax.annotation.concurrent.ThreadSafe;
+
+/**
+ * HttpClient for Droids: a {@link SystemDefaultHttpClient} subclass that currently adds
+ * no configuration of its own.
+ */
+@ThreadSafe
+public class DroidsHttpClient extends SystemDefaultHttpClient {
+  /** Creates a client without explicit parameters by delegating with {@code null}. */
+  public DroidsHttpClient() {
+    this((HttpParams) null);
+  }
+
+  /**
+   * Creates a client, passing {@code params} on to the SystemDefaultHttpClient constructor.
+   */
+  public DroidsHttpClient(final HttpParams params) {
+    super(params);
+  }
+}
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/DroidsHttpClient.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/DroidsHttpClient.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/fetcher/DroidsHttpClient.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/HttpClientContentLoader.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/HttpClientContentLoader.java?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/HttpClientContentLoader.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/HttpClientContentLoader.java Tue Jan 29 17:27:43 2013
@@ -0,0 +1,74 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+
+package org.apache.droids.norobots;
+
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.StatusLine;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.HttpResponseException;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpHead;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+
+
+import org.apache.droids.norobots.ContentLoader;
+import org.slf4j.spi.LocationAwareLogger;
+
+/**
+ * {@link ContentLoader} that reads resources through an HttpClient.
+ */
+public class HttpClientContentLoader implements ContentLoader {
+  private final HttpClient httpclient;
+  /** Creates a loader that sends all requests through {@code httpclient}. */
+  public HttpClientContentLoader(HttpClient httpclient) {
+    this.httpclient = httpclient;
+  }
+
+  /** Sends a HEAD request and reports whether the server answers with status 200 (OK). */
+  public boolean exists(URI uri) throws IOException {
+    HttpResponse response = httpclient.execute(new HttpHead(uri));
+    return response.getStatusLine().getStatusCode() == HttpStatus.SC_OK;
+  }
+
+  /**
+   * Loads {@code uri} with GET. Returns the content stream, or null for a 404 response or
+   * a response without entity; throws HttpResponseException for any other status >= 400.
+   */
+  public InputStream load(URI uri) throws IOException {
+    HttpGet httpget = new HttpGet(uri);
+    HttpResponse response = httpclient.execute(httpget);
+    StatusLine statusline = response.getStatusLine();
+    int status = statusline.getStatusCode();
+    if (status >= HttpStatus.SC_BAD_REQUEST) {
+      httpget.abort(); // the error body is never read; abort so the connection is released
+      if (status == HttpStatus.SC_NOT_FOUND) {
+        return null;
+      }
+      throw new HttpResponseException(status, statusline.getReasonPhrase());
+    }
+    HttpEntity entity = response.getEntity();
+    return entity != null ? entity.getContent() : null;
+  }
+}
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/HttpClientContentLoader.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/HttpClientContentLoader.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/main/java/org/apache/droids/norobots/HttpClientContentLoader.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java Tue Jan 29 17:27:43 2013
@@ -0,0 +1,78 @@
+package org.apache.droids.crawler;
+
+import junit.framework.Assert;
+import org.apache.droids.core.Droid;
+import org.apache.droids.core.SimpleTaskQueueWithHistory;
+import org.apache.droids.core.TaskMaster;
+import org.apache.droids.crawler.localserver.LocalHttpServer;
+import org.apache.droids.crawler.localserver.ResourceHandler;
+import org.apache.droids.filter.HostFilter;
+import org.apache.droids.handle.ReportHandler;
+import org.apache.droids.handle.SysoutHandler;
+import org.apache.droids.parse.SimpleLinkParser;
+import org.apache.droids.taskmaster.SequentialTaskMaster;
+import static org.junit.Assert.*;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ *
+ *
+ *
+ */
+public class CrawlingDroidTest {
+
+ protected LocalHttpServer testserver;
+
+ @Before
+ public void initializeLocalTestServer() {
+ this.testserver = new LocalHttpServer();
+ }
+
+ @After
+ public void shutdownLocalTestServer() throws IOException {
+ this.testserver.stop();
+ }
+
+ @Test
+ public void test() throws IOException {
+ this.testserver.register("*", new ResourceHandler());
+ this.testserver.start();
+
+ String baseURI = "http:/" + this.testserver.getServiceAddress();
+ String targetURI = baseURI + "/start_html";
+
+
+ Queue<LinkTask> queue = new SimpleTaskQueueWithHistory<LinkTask>();
+ TaskMaster<LinkTask> taskMaster = new SequentialTaskMaster<LinkTask>();
+
+ Collection<String> initialLocations = new LinkedList<String>();
+ initialLocations.add(targetURI);
+
+
+ SimpleCrawlingDroid droid = new SimpleCrawlingDroid(queue, taskMaster);
+ droid.setInitialLocations(initialLocations);
+ droid.addParsers(new SimpleLinkParser());
+
+
+ // just output the filename
+ droid.addHandlers(new ReportHandler());
+
+ droid.start();
+
+
+ assertFalse(ReportHandler.getReport().isEmpty());
+ assertEquals(5, ReportHandler.getReport().size());
+ assertTrue(ReportHandler.getReport().contains(baseURI + "/start_html"));
+ assertTrue(ReportHandler.getReport().contains(baseURI + "/page1_html"));
+ assertTrue(ReportHandler.getReport().contains(baseURI + "/page2_html"));
+ assertTrue(ReportHandler.getReport().contains(baseURI + "/page3_html"));
+ assertTrue(ReportHandler.getReport().contains(baseURI + "/page4_html"));
+
+ ReportHandler.recycle();
+ }
+}
\ No newline at end of file
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/CrawlingDroidTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/LocalHttpServer.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/LocalHttpServer.java?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/LocalHttpServer.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/LocalHttpServer.java Tue Jan 29 17:27:43 2013
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.localserver;
+
+import com.google.common.base.Preconditions;
+import org.apache.http.HttpException;
+import org.apache.http.HttpServerConnection;
+import org.apache.http.impl.DefaultConnectionReuseStrategy;
+import org.apache.http.impl.DefaultHttpResponseFactory;
+import org.apache.http.impl.DefaultHttpServerConnection;
+import org.apache.http.params.BasicHttpParams;
+import org.apache.http.params.CoreConnectionPNames;
+import org.apache.http.params.CoreProtocolPNames;
+import org.apache.http.params.HttpParams;
+import org.apache.http.protocol.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.SocketAddress;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Local HTTP server for tests that require one.
+ */
+public class LocalHttpServer {
+
+ private final Logger log = LoggerFactory.getLogger(LocalHttpServer.class);
+ /**
+ * The local address to bind to. The host is an IP number rather than
+ * "localhost" to avoid surprises on hosts that map "localhost" to an IPv6
+ * address or something else. The port is 0 to let the system pick one.
+ */
+ public final static InetSocketAddress TEST_SERVER_ADDR = new InetSocketAddress("127.0.0.1", 0);
+ /**
+ * The request handler registry.
+ */
+ private final HttpRequestHandlerRegistry handlerRegistry;
+ /**
+ * The HTTP processor. If the interceptors are thread safe and the list is not
+ * modified during operation, the processor is thread safe.
+ */
+ private final BasicHttpProcessor httpProcessor;
+ /**
+ * The server parameters.
+ */
+ private final HttpParams params;
+ /**
+ * The server socket, while being served.
+ */
+ private volatile ServerSocket servicedSocket;
+ /**
+ * The request listening thread, while listening.
+ */
+ private volatile Thread listenerThread;
+ /**
+ * The number of connections this accepted.
+ */
+ private final AtomicInteger acceptedConnections = new AtomicInteger(0);
+
+ /**
+ * Creates a new test server.
+ */
+ public LocalHttpServer() {
+ this.handlerRegistry = new HttpRequestHandlerRegistry();
+ this.httpProcessor = new BasicHttpProcessor();
+ this.httpProcessor.addInterceptor(new ResponseDate());
+ this.httpProcessor.addInterceptor(new ResponseServer());
+ this.httpProcessor.addInterceptor(new ResponseContent());
+ this.httpProcessor.addInterceptor(new ResponseConnControl());
+ this.params = new BasicHttpParams();
+ this.params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, 5000).setIntParameter(CoreConnectionPNames.SOCKET_BUFFER_SIZE, 8 * 1024).setBooleanParameter(CoreConnectionPNames.STALE_CONNECTION_CHECK, false).setBooleanParameter(CoreConnectionPNames.TCP_NODELAY, true).setParameter(CoreProtocolPNames.ORIGIN_SERVER, "LocalTestServer/1.1");
+ }
+
+ /**
+ * Returns the number of connections this test server has accepted.
+ */
+ public int getAcceptedConnectionCount() {
+ return this.acceptedConnections.get();
+ }
+
+ /**
+ * Registers a handler with the local registry.
+ *
+ * @param pattern the URL pattern to match
+ * @param handler the handler to apply
+ */
+ public void register(String pattern, HttpRequestHandler handler) {
+ this.handlerRegistry.register(pattern, handler);
+ }
+
+ /**
+ * Unregisters a handler from the local registry.
+ *
+ * @param pattern the URL pattern
+ */
+ public void unregister(String pattern) {
+ this.handlerRegistry.unregister(pattern);
+ }
+
+ /**
+ * Starts this test server. Use {@link #getServicePort getServicePort} to
+ * obtain the port number afterwards.
+ */
+ public void start() throws IOException {
+ if (servicedSocket != null) {
+ return; // Already running
+ }
+
+ ServerSocket ssock = new ServerSocket();
+ ssock.setReuseAddress(true); // probably pointless for port '0'
+ ssock.bind(TEST_SERVER_ADDR);
+ this.servicedSocket = ssock;
+
+ this.listenerThread = new Thread(new RequestListener());
+ this.listenerThread.setDaemon(false);
+ this.listenerThread.start();
+ }
+
+ /**
+ * Stops this test server.
+ */
+ public void stop() throws IOException {
+ if (this.servicedSocket == null) {
+ return; // not running
+ }
+
+ try {
+ this.servicedSocket.close();
+ } catch (IOException ex) {
+ log.error(ex.getMessage(), ex);
+ } finally {
+ this.servicedSocket = null;
+ }
+
+ if (this.listenerThread != null) {
+ this.listenerThread.interrupt();
+ this.listenerThread = null;
+ }
+ }
+
+ @Override
+ public String toString() {
+ ServerSocket ssock = servicedSocket; // avoid synchronization
+ StringBuffer sb = new StringBuffer(80);
+ sb.append("LocalTestServer/");
+ if (ssock == null) {
+ sb.append("stopped");
+ } else {
+ sb.append(ssock.getLocalSocketAddress());
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Obtains the port this server is servicing.
+ *
+ * @return the service port
+ */
+ public int getServicePort() {
+ ServerSocket ssock = this.servicedSocket; // avoid synchronization
+ Preconditions.checkState(ssock != null, "not running");
+ return ssock.getLocalPort();
+ }
+
+ /**
+ * Obtains the local address the server is listening on
+ *
+ * @return the service address
+ */
+ public SocketAddress getServiceAddress() {
+ ServerSocket ssock = this.servicedSocket; // avoid synchronization
+ Preconditions.checkState(ssock != null, "not running");
+ return ssock.getLocalSocketAddress();
+ }
+
+ /**
+ * The request listener. Accepts incoming connections and launches a service
+ * thread.
+ */
+ public class RequestListener implements Runnable {
+
+ /**
+ * The workers launched from here.
+ */
+ private final Set<Thread> workerThreads;
+
+ public RequestListener() {
+ super();
+ this.workerThreads = Collections.synchronizedSet(new HashSet<Thread>());
+ }
+
+ public void run() {
+ try {
+ while ((servicedSocket != null) && (listenerThread == Thread.currentThread())
+ && !Thread.interrupted()) {
+ try {
+ accept();
+ } catch (Exception ex) {
+ ServerSocket ssock = servicedSocket;
+ if ((ssock != null) && !ssock.isClosed()) {
+ log.error(LocalHttpServer.this.toString() + " could not accept", ex);
+ }
+ // otherwise ignore the exception silently
+ break;
+ }
+ }
+ } finally {
+ cleanup();
+ }
+ }
+
+ protected void accept() throws IOException {
+ // Set up HTTP connection
+ Socket socket = servicedSocket.accept();
+ acceptedConnections.incrementAndGet();
+ DefaultHttpServerConnection conn = new DefaultHttpServerConnection();
+ conn.bind(socket, params);
+
+ // Set up the HTTP service
+ HttpService httpService = new HttpService(httpProcessor,
+ new DefaultConnectionReuseStrategy(), new DefaultHttpResponseFactory());
+ httpService.setParams(params);
+ httpService.setHandlerResolver(handlerRegistry);
+
+ // Start worker thread
+ Thread t = new Thread(new Worker(httpService, conn));
+ workerThreads.add(t);
+ t.setDaemon(true);
+ t.start();
+
+ }
+
+ protected void cleanup() {
+ Thread[] threads = workerThreads.toArray(new Thread[0]);
+ for (int i = 0; i < threads.length; i++) {
+ if (threads[i] != null) {
+ threads[i].interrupt();
+ }
+ }
+ }
+
+ /**
+ * A worker for serving incoming requests.
+ */
+ public class Worker implements Runnable {
+
+ private final HttpService httpservice;
+ private final HttpServerConnection conn;
+
+ public Worker(final HttpService httpservice, final HttpServerConnection conn) {
+
+ this.httpservice = httpservice;
+ this.conn = conn;
+ }
+
+ public void run() {
+ HttpContext context = new BasicHttpContext(null);
+ try {
+ while ((servicedSocket != null) && this.conn.isOpen() && !Thread.interrupted()) {
+ this.httpservice.handleRequest(this.conn, context);
+ }
+ } catch (IOException ex) {
+ // ignore silently
+ } catch (HttpException ex) {
+ // ignore silently
+ } finally {
+ workerThreads.remove(Thread.currentThread());
+ try {
+ this.conn.shutdown();
+ } catch (IOException ignore) {
+ }
+ }
+ }
+ }
+ }
+}
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/LocalHttpServer.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/LocalHttpServer.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/LocalHttpServer.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/ResourceHandler.java
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/ResourceHandler.java?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/ResourceHandler.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/ResourceHandler.java Tue Jan 29 17:27:43 2013
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler.localserver;
+
+import org.apache.http.*;
+import org.apache.http.entity.InputStreamEntity;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.protocol.HttpContext;
+import org.apache.http.protocol.HttpRequestHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Locale;
+
+/**
+ * A handler that serves out a resource
+ */
+ public class ResourceHandler implements HttpRequestHandler {
+
+     /**
+      * Answers GET and HEAD requests with the classpath resource located under
+      * {@code data/} for the request URI. If there is no such resource the
+      * response is a 404 with a short message. Resources whose request URI
+      * ends with {@code _html} are sent chunked with content type text/html.
+      *
+      * @throws MethodNotSupportedException for any other request method
+      * @throws IOException if the resource cannot be opened
+      */
+     public void handle(final HttpRequest request, final HttpResponse response,
+             final HttpContext context) throws HttpException, IOException {
+
+         final RequestLine requestLine = request.getRequestLine();
+         final String verb = requestLine.getMethod().toUpperCase(Locale.ENGLISH);
+         final boolean supported = "GET".equals(verb) || "HEAD".equals(verb);
+         if (!supported) {
+             throw new MethodNotSupportedException(verb + " not supported by " + getClass().getName());
+         }
+
+         // Map the request URI onto the classpath: "/x" and "x" both become "data/x".
+         final String requestURI = requestLine.getUri();
+         final String resourcePath = requestURI.startsWith("/")
+                 ? "data" + requestURI
+                 : "data/" + requestURI;
+         final URL resource = ResourceHandler.class.getClassLoader().getResource(resourcePath);
+
+         if (resource == null) {
+             response.setStatusCode(HttpStatus.SC_NOT_FOUND);
+             StringEntity notFound = new StringEntity(requestURI + " not found", "US-ASCII");
+             notFound.setContentType("text/html");
+             response.setEntity(notFound);
+             return;
+         }
+
+         // Length -1: the size of the resource is not known up front.
+         InputStream content = resource.openStream();
+         InputStreamEntity body = new InputStreamEntity(content, -1);
+         if (requestURI.endsWith("_html")) {
+             body.setContentType("text/html");
+             body.setChunked(true);
+         }
+         response.setEntity(body);
+     }
+
+ }
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/ResourceHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/ResourceHandler.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/java/org/apache/droids/crawler/localserver/ResourceHandler.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page1_html
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page1_html?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page1_html (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page1_html Tue Jan 29 17:27:43 2013
@@ -0,0 +1,22 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+ <head>Page 1</head>
+ <body>
+ <a href="/page3_html">Page3</a>
+ </body>
+</html>
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page2_html
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page2_html?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page2_html (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page2_html Tue Jan 29 17:27:43 2013
@@ -0,0 +1,22 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+ <head>Page 2</head>
+ <body>
+ <a href="/page4_html">Page4</a>
+ </body>
+</html>
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page3_html
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page3_html?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page3_html (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page3_html Tue Jan 29 17:27:43 2013
@@ -0,0 +1,23 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<html>
+ <head>Page 3</head>
+ <body>
+ <p>Yada yada</p>
+ </body>
+</html>
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page4_html
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page4_html?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page4_html (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/page4_html Tue Jan 29 17:27:43 2013
@@ -0,0 +1,22 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+ <head>Page 4</head>
+ <body>
+ <p>Blah blah blah</p>
+ </body>
+</html>
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/start_html
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/start_html?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/start_html (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/data/start_html Tue Jan 29 17:27:43 2013
@@ -0,0 +1,23 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+ <head>Start page</head>
+ <body>
+ <a href="/page1_html">Page1</a>
+ <a href="/page2_html">Page2</a>
+ </body>
+</html>
Added: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/logback.xml
URL: http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/logback.xml?rev=1440010&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/logback.xml (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/logback.xml Tue Jan 29 17:27:43 2013
@@ -0,0 +1,18 @@
+<configuration>
+
+ <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+ <!-- encoders are assigned the type
+ ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
+ <encoder>
+ <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
+ </encoder>
+ </appender>
+
+ <logger name="org.apache.droids" level="info"/>
+ <logger name="org.apache.http.wire" level="info"/>
+
+
+ <root level="info">
+ <appender-ref ref="STDOUT"/>
+ </root>
+</configuration>
\ No newline at end of file
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/logback.xml
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/logback.xml
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-crawler/src/test/resources/logback.xml
------------------------------------------------------------------------------
svn:mime-type = text/xml