You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ol...@apache.org on 2008/11/06 13:30:36 UTC
svn commit: r711854 - in /incubator/droids/trunk/src: java/
java/org/apache/droids/api/ java/org/apache/droids/exception/
java/org/apache/droids/impl/ java/org/apache/droids/parse/html/
java/org/apache/droids/robot/crawler/ test/java/org/apache/droids/...
Author: olegk
Date: Thu Nov 6 05:30:18 2008
New Revision: 711854
URL: http://svn.apache.org/viewvc?rev=711854&view=rev
Log:
* Refactored exception handling code in HtmlParser
* Replaced URL calls with equivalent URI calls in HtmlParser
* Added SequentialTaskMaster intended to execute Tasks sequentially one at a time
* Added test case to test termination of Droid execution in case of an exception
Added:
incubator/droids/trunk/src/java/org/apache/droids/exception/ContentFormatViolationException.java
incubator/droids/trunk/src/java/org/apache/droids/exception/InvalidLinkException.java
incubator/droids/trunk/src/java/org/apache/droids/impl/SequentialTaskMaster.java
incubator/droids/trunk/src/test/java/org/apache/droids/DroidsFactory.java
Modified:
incubator/droids/trunk/src/java/log4j.properties
incubator/droids/trunk/src/java/org/apache/droids/api/Parser.java
incubator/droids/trunk/src/java/org/apache/droids/parse/html/HtmlParser.java
incubator/droids/trunk/src/java/org/apache/droids/robot/crawler/CrawlingWorker.java
incubator/droids/trunk/src/test/java/org/apache/droids/impl/TestSimpleDroid.java
Modified: incubator/droids/trunk/src/java/log4j.properties
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/java/log4j.properties?rev=711854&r1=711853&r2=711854&view=diff
==============================================================================
--- incubator/droids/trunk/src/java/log4j.properties (original)
+++ incubator/droids/trunk/src/java/log4j.properties Thu Nov 6 05:30:18 2008
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-log4j.rootLogger=@loglevel@, stdout, logfile
+log4j.rootLogger=INFO, stdout, logfile
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
Modified: incubator/droids/trunk/src/java/org/apache/droids/api/Parser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/java/org/apache/droids/api/Parser.java?rev=711854&r1=711853&r2=711854&view=diff
==============================================================================
--- incubator/droids/trunk/src/java/org/apache/droids/api/Parser.java (original)
+++ incubator/droids/trunk/src/java/org/apache/droids/api/Parser.java Thu Nov 6 05:30:18 2008
@@ -16,8 +16,11 @@
*/
package org.apache.droids.api;
+import java.io.IOException;
import java.io.InputStream;
+import org.apache.droids.exception.DroidsException;
+
/**
* Simple parser that is only forcing to return a parse object.
*
@@ -35,5 +38,5 @@
* the link that correspond to the stream
* @return the parse object
*/
- Parse getParse(InputStream openStream, Link link);
+ Parse getParse(InputStream openStream, Link link) throws DroidsException, IOException;
}
Added: incubator/droids/trunk/src/java/org/apache/droids/exception/ContentFormatViolationException.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/java/org/apache/droids/exception/ContentFormatViolationException.java?rev=711854&view=auto
==============================================================================
--- incubator/droids/trunk/src/java/org/apache/droids/exception/ContentFormatViolationException.java (added)
+++ incubator/droids/trunk/src/java/org/apache/droids/exception/ContentFormatViolationException.java Thu Nov 6 05:30:18 2008
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.exception;
+
+/**
+ * Signals content format violation.
+ * <p>
+ * The exception always carries the underlying failure as its cause, so
+ * callers can inspect the original error that was raised while processing
+ * the content.
+ *
+ * @version 1.0
+ */
+public class ContentFormatViolationException extends DroidsException {
+
+ // Explicit serialization version identifier for this exception type.
+ private static final long serialVersionUID = -3897055120550880304L;
+
+ /**
+ * Creates a new exception with the given detail message and cause.
+ *
+ * @param message description of the format violation
+ * @param cause the underlying failure that triggered this exception
+ */
+ public ContentFormatViolationException(String message, Throwable cause) {
+ super(message);
+ // The cause is attached separately through initCause.
+ initCause(cause);
+ }
+
+}
Added: incubator/droids/trunk/src/java/org/apache/droids/exception/InvalidLinkException.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/java/org/apache/droids/exception/InvalidLinkException.java?rev=711854&view=auto
==============================================================================
--- incubator/droids/trunk/src/java/org/apache/droids/exception/InvalidLinkException.java (added)
+++ incubator/droids/trunk/src/java/org/apache/droids/exception/InvalidLinkException.java Thu Nov 6 05:30:18 2008
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.exception;
+
+/**
+ * Signals Link format violation.
+ * <p>
+ * May be created with a message only, or with a message and the underlying
+ * failure that made the link invalid.
+ *
+ * @version 1.0
+ */
+public class InvalidLinkException extends DroidsException {
+
+ // Explicit serialization version identifier for this exception type.
+ private static final long serialVersionUID = 5608058374859478284L;
+
+ /**
+ * Creates a new exception with the given detail message and no cause.
+ *
+ * @param message description of the link format violation
+ */
+ public InvalidLinkException(String message) {
+ super(message);
+ }
+
+ /**
+ * Creates a new exception with the given detail message and cause.
+ *
+ * @param message description of the link format violation
+ * @param cause the underlying failure that triggered this exception
+ */
+ public InvalidLinkException(String message, Throwable cause) {
+ super(message);
+ // The cause is attached separately through initCause.
+ initCause(cause);
+ }
+
+}
Added: incubator/droids/trunk/src/java/org/apache/droids/impl/SequentialTaskMaster.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/java/org/apache/droids/impl/SequentialTaskMaster.java?rev=711854&view=auto
==============================================================================
--- incubator/droids/trunk/src/java/org/apache/droids/impl/SequentialTaskMaster.java (added)
+++ incubator/droids/trunk/src/java/org/apache/droids/impl/SequentialTaskMaster.java Thu Nov 6 05:30:18 2008
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.impl;
+
+import java.util.Date;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.droids.api.DelayTimer;
+import org.apache.droids.api.Droid;
+import org.apache.droids.api.Task;
+import org.apache.droids.api.TaskExceptionHandler;
+import org.apache.droids.api.TaskExceptionResult;
+import org.apache.droids.api.TaskMaster;
+import org.apache.droids.api.TaskQueue;
+import org.apache.droids.api.Worker;
+import org.apache.droids.helper.Loggable;
+
+/**
+ * {@link TaskMaster} that executes tasks sequentially, one at a time, on the
+ * thread that calls {@link #processAllTasks(TaskQueue, Droid)}.
+ * <p>
+ * An optional {@link DelayTimer} inserts a pause before each task and an
+ * optional {@link TaskExceptionHandler} decides whether an exception thrown
+ * by a worker is merely logged or terminates processing.
+ */
+public class SequentialTaskMaster<T extends Task>
+  extends Loggable implements TaskMaster<T>
+{
+  // Monitor used to signal completion to threads blocked in awaitTermination.
+  private final Object mutex;
+
+  private volatile boolean completed;
+  private volatile Date startedWorking = null;
+  private volatile Date finishedWorking = null;
+  private volatile int completedTask = 0;
+  private volatile T lastCompletedTask = null;
+
+  private DelayTimer delayTimer = null;
+  private TaskExceptionHandler exHandler = null;
+
+  public SequentialTaskMaster() {
+    super();
+    this.mutex = new Object();
+  }
+
+  /**
+   * Takes tasks from the queue one by one and executes each of them with a
+   * new worker obtained from the droid. The queue has been initialized.
+   * <p>
+   * Processing stops when the queue returns {@code null}, when the exception
+   * handler classifies a worker exception as {@code FATAL}, or when the
+   * calling thread is interrupted while pausing between tasks. Afterwards
+   * the droid is notified through {@code finished()} and all threads waiting
+   * in {@link #awaitTermination(long, TimeUnit)} are released.
+   *
+   * @param queue source of the tasks to execute
+   * @param droid provider of workers; notified once processing has ended
+   */
+  public synchronized void processAllTasks(
+      final TaskQueue<T> queue, final Droid<T> droid)
+  {
+    this.completed = false;
+    this.startedWorking = new Date();
+    this.finishedWorking = null;
+    this.completedTask = 0;
+
+    boolean terminated = false;
+    while( !terminated ) {
+      T task = queue.next();
+      if (task == null) {
+        break;
+      }
+      if( delayTimer != null ) {
+        long delay = delayTimer.getDelayMillis();
+        if( delay > 0 ) {
+          try {
+            Thread.sleep( delay );
+          }
+          catch (InterruptedException e) {
+            // Treat interruption as a request to stop: keep the interrupt
+            // status visible to the caller and leave the loop instead of
+            // silently carrying on. The task just taken is not executed.
+            Thread.currentThread().interrupt();
+            break;
+          }
+        }
+      }
+      Worker<T> worker = droid.getNewWorker();
+      try {
+        worker.execute( task );
+        completedTask++;
+        lastCompletedTask = task;
+      } catch (Exception ex) {
+        // Without a handler every failure is only a warning.
+        TaskExceptionResult result = TaskExceptionResult.WARN;
+        if (exHandler != null) {
+          result = exHandler.handleException(ex);
+        }
+        switch (result) {
+        case WARN:
+          log.warn(ex.getMessage());
+          break;
+        case FATAL:
+          log.warn(ex.getMessage());
+          terminated = true;
+          break;
+        }
+      }
+    }
+    finishedWorking = new Date();
+    droid.finished();
+    synchronized( mutex ) {
+      completed = true;
+      mutex.notifyAll();
+    }
+  }
+
+  /**
+   * Sets the handler consulted when a worker throws an exception.
+   *
+   * @param exHandler the handler, or {@code null} to log every failure as a warning
+   */
+  public final void setExceptionHandler(TaskExceptionHandler exHandler) {
+    this.exHandler = exHandler;
+  }
+
+
+  /**
+   * Sets the timer that supplies the pause applied before each task.
+   *
+   * @param delayTimer the timer, or {@code null} for no pause
+   */
+  public final void setDelayTimer(DelayTimer delayTimer) {
+    this.delayTimer = delayTimer;
+  }
+
+
+  /**
+   * @return the time the most recent call to processAllTasks started,
+   *         or {@code null} if it has never been called
+   */
+  public Date getStartTime() {
+    return startedWorking;
+  }
+
+
+  /**
+   * @return the time the most recent run ended, or {@code null} if no run
+   *         has ended since the last one started
+   */
+  public Date getFinishedWorking() {
+    return finishedWorking;
+  }
+
+
+  /**
+   * @return the number of tasks executed without an exception since the
+   *         most recent run started
+   */
+  public int getCompletedTasks() {
+    return completedTask;
+  }
+
+
+  /**
+   * @return the last task that was executed without an exception, or
+   *         {@code null} if there is none; not reset when a new run starts
+   */
+  public T getLastCompletedTask() {
+    return lastCompletedTask;
+  }
+
+
+  /**
+   * Blocks until processing has completed or the timeout has elapsed,
+   * whichever happens first.
+   *
+   * @param timeout maximum time to wait; zero or a negative value waits
+   *        without a time limit
+   * @param unit the unit of {@code timeout}
+   * @throws InterruptedException if the calling thread is interrupted while waiting
+   */
+  public void awaitTermination(long timeout, TimeUnit unit) throws InterruptedException {
+    if (timeout <= 0) {
+      synchronized (this.mutex) {
+        while (!completed) {
+          this.mutex.wait();
+        }
+      }
+      return;
+    }
+    // Object.wait(long) takes milliseconds, so convert before the first wait.
+    // A positive timeout shorter than one millisecond is rounded up, because
+    // wait(0) would block without a time limit.
+    final long timeoutMillis = Math.max(1L, unit.toMillis(timeout));
+    synchronized (this.mutex) {
+      long deadline = System.currentTimeMillis() + timeoutMillis;
+      if (deadline < 0) {
+        // The sum overflowed for a very large timeout; saturate instead.
+        deadline = Long.MAX_VALUE;
+      }
+      long remaining = timeoutMillis;
+      while (!completed && remaining > 0) {
+        this.mutex.wait(remaining);
+        remaining = deadline - System.currentTimeMillis();
+      }
+    }
+  }
+
+}
Modified: incubator/droids/trunk/src/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/java/org/apache/droids/parse/html/HtmlParser.java?rev=711854&r1=711853&r2=711854&view=diff
==============================================================================
--- incubator/droids/trunk/src/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ incubator/droids/trunk/src/java/org/apache/droids/parse/html/HtmlParser.java Thu Nov 6 05:30:18 2008
@@ -15,10 +15,10 @@
*/
package org.apache.droids.parse.html;
+import java.io.IOException;
import java.io.InputStream;
-import java.net.MalformedURLException;
import java.net.URI;
-import java.net.URL;
+import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@@ -27,6 +27,9 @@
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
import org.apache.droids.api.Parser;
+import org.apache.droids.exception.ContentFormatViolationException;
+import org.apache.droids.exception.DroidsException;
+import org.apache.droids.exception.InvalidLinkException;
import org.apache.droids.helper.Loggable;
import org.apache.droids.LinkTask;
import org.apache.droids.ParseData;
@@ -39,6 +42,7 @@
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
@@ -61,17 +65,13 @@
this.elements = elements;
}
- private URL base = null;
+ private URI base = null;
private Link link = null;
- public Parse getParse(InputStream stream, Link newLink) {
+ public Parse getParse(InputStream stream, Link newLink) throws DroidsException, IOException {
this.link = newLink;
- try {
- this.base = new URL(newLink.getId());
- } catch (MalformedURLException e1) {
- log.fatal(e1);
- }
+ this.base = newLink.getURI();
ParseData parseData = null;
// setup filter chain
final XMLDocumentFilter[] filters = { getRemover() };
@@ -81,21 +81,20 @@
// parse document
// XMLInputSource source = new XMLInputSource(null, uri, uri);
try {
- parser.parse(base.toExternalForm(), node);
- parseData = extract(node);
- } catch (Exception e) {
- log.fatal(e);
- return new ParseImpl(stream.toString(), null);
+ parser.parse(base.toString(), node);
+ } catch (SAXException ex) {
+ throw new ContentFormatViolationException("Failure parsing HTML content", ex);
}
+ parseData = extract(node);
return new ParseImpl(stream.toString(), parseData);
}
- private ParseData extract(DocumentFragment node) {
+ private ParseData extract(DocumentFragment node) throws InvalidLinkException {
final ArrayList<Link> links = new ArrayList<Link>();
try {
extractLinks(node, links, new HashSet<String>());
- } catch (MalformedURLException e) {
- log.fatal(e);
+ } catch (URISyntaxException ex) {
+ throw new InvalidLinkException(ex.getMessage(), ex);
}
return new ParseData(links);
}
@@ -113,10 +112,10 @@
parser.setFeature(
"http://cyberneko.org/html/features/report-errors",
false);
- } catch (SAXNotRecognizedException e) {
- log.fatal(e);
- } catch (SAXNotSupportedException e) {
- log.fatal(e);
+ } catch (SAXNotRecognizedException ex) {
+ throw new IllegalStateException(ex);
+ } catch (SAXNotSupportedException ex) {
+ throw new IllegalStateException(ex);
}
return parser;
}
@@ -135,7 +134,7 @@
}
private void extractLinks(Node node, ArrayList<Link> links,
- HashSet<String> set) throws MalformedURLException {
+ HashSet<String> set) throws URISyntaxException {
if (node.getNodeType() == Node.ELEMENT_NODE) {
String nodeName = node.getNodeName().toLowerCase();
if (elements.containsKey(nodeName)) {
@@ -147,32 +146,28 @@
String attrName = attr.getNodeName();
if (attrName.equalsIgnoreCase(value)) {
target = attr.getNodeValue();
- try {
- String newUrl = "";
- if(target.startsWith("/")){
- newUrl=base.getProtocol()+"://"+base.getHost();
- if(base.getPort()>-1){
- newUrl+=":"+base.getPort();
- }
- newUrl += target;
- }else if(!target.toLowerCase().startsWith("javascript")){
- newUrl=new URL(base, target).toString();
+ String newUrl = "";
+ if(target.startsWith("/")){
+ newUrl=base.getScheme() + "://"+base.getHost();
+ if(base.getPort()>-1){
+ newUrl+=":"+base.getPort();
}
- if (!newUrl.equals("")) {
- // Link from, URI uri, int depth, String text
- String url = target.contains(":/") ? target : newUrl;
- URI uri = new URI( url );
- final LinkTask outlink = new LinkTask( link, uri, link.getDepth()+1 );
- log.debug("set size: "+set.size());
- log.debug("outlink.getToUrl(): "+outlink.getURI());
- log.debug("set.contains(outlink.getToUrl(): "+set.contains(url));
- if (!set.contains(url)) {
- set.add(url);
- links.add(outlink);
- }
+ newUrl += target;
+ }else if(!target.toLowerCase().startsWith("javascript")){
+ newUrl = base.resolve(target).toString();
+ }
+ if (!newUrl.equals("")) {
+ // Link from, URI uri, int depth, String text
+ String url = target.contains(":/") ? target : newUrl;
+ URI uri = new URI( url );
+ final LinkTask outlink = new LinkTask( link, uri, link.getDepth()+1 );
+ log.debug("set size: "+set.size());
+ log.debug("outlink.getToUrl(): "+outlink.getURI());
+ log.debug("set.contains(outlink.getToUrl(): "+set.contains(url));
+ if (!set.contains(url)) {
+ set.add(url);
+ links.add(outlink);
}
- } catch (Exception e) {
- log.fatal(e);
}
}
}
Modified: incubator/droids/trunk/src/java/org/apache/droids/robot/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/java/org/apache/droids/robot/crawler/CrawlingWorker.java?rev=711854&r1=711853&r2=711854&view=diff
==============================================================================
--- incubator/droids/trunk/src/java/org/apache/droids/robot/crawler/CrawlingWorker.java (original)
+++ incubator/droids/trunk/src/java/org/apache/droids/robot/crawler/CrawlingWorker.java Thu Nov 6 05:30:18 2008
@@ -20,7 +20,7 @@
import java.io.InputStream;
import java.net.URL;
import java.util.Collection;
-import java.util.HashMap;
+import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.droids.api.Link;
@@ -105,7 +105,7 @@
URLFiltersFactory filters = droid.getFiltersFactory();
// TODO -- make the hashvalue for Outlink...
- Map<String,Link> filtered = new HashMap<String,Link>();
+ Map<String,Link> filtered = new LinkedHashMap<String,Link>();
for( Link outlink : parse.getData().getOutlinks() ) {
String id = outlink.getId();
if (filters.accept(outlink.getId()) && !filtered.containsKey(id)) {
Added: incubator/droids/trunk/src/test/java/org/apache/droids/DroidsFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/test/java/org/apache/droids/DroidsFactory.java?rev=711854&view=auto
==============================================================================
--- incubator/droids/trunk/src/test/java/org/apache/droids/DroidsFactory.java (added)
+++ incubator/droids/trunk/src/test/java/org/apache/droids/DroidsFactory.java Thu Nov 6 05:30:18 2008
@@ -0,0 +1,97 @@
+package org.apache.droids;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+
+import org.apache.droids.api.Droid;
+import org.apache.droids.api.Handler;
+import org.apache.droids.api.Link;
+import org.apache.droids.api.URLFilter;
+import org.apache.droids.delay.SimpleDelayTimer;
+import org.apache.droids.helper.factories.HandlerFactory;
+import org.apache.droids.helper.factories.ParserFactory;
+import org.apache.droids.helper.factories.ProtocolFactory;
+import org.apache.droids.helper.factories.URLFiltersFactory;
+import org.apache.droids.impl.SequentialTaskMaster;
+import org.apache.droids.impl.SimpleTaskQueue;
+import org.apache.droids.parse.html.HtmlParser;
+import org.apache.droids.protocol.http.Http;
+import org.apache.droids.robot.crawler.CrawlingDroid;
+
+/**
+ * Static helpers that assemble a crawling droid and the factories it
+ * depends on, using fixed default settings.
+ */
+public class DroidsFactory
+{
+
+  /**
+   * Builds a parser factory that maps "text/html" to an {@link HtmlParser}
+   * set up for the a/href, link/href, img/src and script/src pairs.
+   *
+   * @return the configured parser factory
+   */
+  public static ParserFactory createDefaultParserFactory() {
+    ParserFactory factory = new ParserFactory();
+    HtmlParser html = new HtmlParser();
+    html.setElements(new HashMap<String, String>());
+    html.getElements().put("a", "href");
+    html.getElements().put("link", "href");
+    html.getElements().put("img", "src");
+    html.getElements().put("script", "src");
+    factory.setMap(new HashMap<String, Object>());
+    factory.getMap().put("text/html", html);
+    return factory;
+  }
+
+  /**
+   * Builds a protocol factory that maps "http" to an {@link Http} protocol
+   * with force-allow switched on and the user agent "Droids/1.1".
+   *
+   * @return the configured protocol factory
+   */
+  public static ProtocolFactory createDefaultProtocolFactory() {
+    ProtocolFactory factory = new ProtocolFactory();
+    Http http = new Http();
+    http.setForceAllow(true);
+    http.setUserAgent("Droids/1.1");
+
+    factory.setMap(new HashMap<String, Object>());
+    factory.getMap().put("http", http);
+    return factory;
+  }
+
+  /**
+   * Builds a URL filter factory whose "default" filter hands back every
+   * URL string unchanged.
+   *
+   * @return the configured filter factory
+   */
+  public static URLFiltersFactory createDefaultURLFiltersFactory() {
+    URLFiltersFactory factory = new URLFiltersFactory();
+    URLFilter passThrough = new URLFilter() {
+
+      public String filter(String urlString) {
+        return urlString;
+      }
+
+    };
+    factory.setMap(new HashMap<String, Object>());
+    factory.getMap().put("default", passThrough);
+    return factory;
+  }
+
+  /**
+   * Builds a handler factory that registers the given handler as "default".
+   *
+   * @param defaultHandler the handler to register
+   * @return the configured handler factory
+   */
+  public static HandlerFactory createDefaultHandlerFactory(Handler defaultHandler) {
+    HandlerFactory factory = new HandlerFactory();
+    factory.setMap(new HashMap<String, Object>());
+    factory.getMap().put("default", defaultHandler);
+    return factory;
+  }
+
+  /**
+   * Wires a {@link CrawlingDroid} from the default factories, a simple task
+   * queue and a {@link SequentialTaskMaster} that pauses 100 ms between
+   * tasks, with the given URI as the only initial location.
+   *
+   * @param targetURI the location where crawling starts
+   * @param testHandler the handler registered as the default handler
+   * @return the configured droid
+   */
+  public static Droid<Link> createSimpleCrawlingDroid(
+      String targetURI, Handler testHandler) {
+    ParserFactory parsers = createDefaultParserFactory();
+    ProtocolFactory protocols = createDefaultProtocolFactory();
+    URLFiltersFactory filters = createDefaultURLFiltersFactory();
+    HandlerFactory handlers = createDefaultHandlerFactory(testHandler);
+
+    SimpleDelayTimer delayTimer = new SimpleDelayTimer();
+    delayTimer.setDelayMillis(100);
+
+    SimpleTaskQueue<Link> queue = new SimpleTaskQueue<Link>();
+
+    SequentialTaskMaster<Link> master = new SequentialTaskMaster<Link>();
+    master.setDelayTimer( delayTimer );
+
+    CrawlingDroid droid = new CrawlingDroid( queue, master );
+    droid.setFiltersFactory(filters);
+    droid.setParserFactory(parsers);
+    droid.setProtocolFactory(protocols);
+    droid.setHandlerFactory(handlers);
+
+    Collection<String> startLocations = new ArrayList<String>();
+    startLocations.add( targetURI );
+    droid.setInitialLocations(startLocations);
+    return droid;
+  }
+
+}
Modified: incubator/droids/trunk/src/test/java/org/apache/droids/impl/TestSimpleDroid.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/src/test/java/org/apache/droids/impl/TestSimpleDroid.java?rev=711854&r1=711853&r2=711854&view=diff
==============================================================================
--- incubator/droids/trunk/src/test/java/org/apache/droids/impl/TestSimpleDroid.java (original)
+++ incubator/droids/trunk/src/test/java/org/apache/droids/impl/TestSimpleDroid.java Thu Nov 6 05:30:18 2008
@@ -20,29 +20,23 @@
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import junit.framework.Assert;
+import org.apache.droids.DroidsFactory;
+import org.apache.droids.api.Droid;
+import org.apache.droids.api.Handler;
import org.apache.droids.api.Link;
import org.apache.droids.api.Parse;
-import org.apache.droids.api.Handler;
-import org.apache.droids.delay.SimpleDelayTimer;
-import org.apache.droids.helper.factories.DroidFactory;
-import org.apache.droids.helper.factories.HandlerFactory;
-import org.apache.droids.helper.factories.ParserFactory;
-import org.apache.droids.helper.factories.ProtocolFactory;
-import org.apache.droids.helper.factories.URLFiltersFactory;
+import org.apache.droids.api.TaskExceptionHandler;
+import org.apache.droids.api.TaskExceptionResult;
+import org.apache.droids.api.TaskMaster;
import org.apache.droids.localserver.LocalHttpServer;
import org.apache.droids.localserver.ResourceHandler;
-import org.apache.droids.net.RegexURLFilter;
-import org.apache.droids.parse.html.HtmlParser;
-import org.apache.droids.protocol.http.Http;
-import org.apache.droids.robot.crawler.CrawlingDroid;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -71,77 +65,81 @@
String baseURI = "http:/" + this.testserver.getServiceAddress();
String targetURI = baseURI + "/start_html";
- final Set<URL> visitedLinks = new HashSet<URL>();
-
- ParserFactory parserFactory = new ParserFactory();
- HtmlParser htmlParser = new HtmlParser();
- htmlParser.setElements(new HashMap<String, String>());
- htmlParser.getElements().put("a", "href");
- htmlParser.getElements().put("link", "href");
- htmlParser.getElements().put("img", "src");
- htmlParser.getElements().put("script", "src");
- parserFactory.setMap(new HashMap<String, Object>());
- parserFactory.getMap().put("text/html", htmlParser);
-
- ProtocolFactory protocolFactory = new ProtocolFactory();
- Http httpProtocol = new Http();
- httpProtocol.setForceAllow(true);
- httpProtocol.setUserAgent("Droids/1.1");
-
- protocolFactory.setMap(new HashMap<String, Object>());
- protocolFactory.getMap().put("http", httpProtocol);
-
- URLFiltersFactory filtersFactory = new URLFiltersFactory();
- RegexURLFilter defaultURLFilter = new RegexURLFilter();
- defaultURLFilter.setFile("classpath:/regex-urlfilter.txt");
- filtersFactory.setMap(new HashMap<String, Object>());
- filtersFactory.getMap().put("default", defaultURLFilter);
-
- HandlerFactory handlerFactory = new HandlerFactory();
- Handler defaultHandler = new Handler() {
+ final List<URL> visitedLinks = new ArrayList<URL>();
+
+ Handler testHandler = new Handler() {
public void handle(InputStream openStream, URL url, Parse parse) {
visitedLinks.add(url);
}
-
+
};
- handlerFactory.setMap(new HashMap<String, Object>());
- handlerFactory.getMap().put("default", defaultHandler);
- DroidFactory<Link> droidFactory = new DroidFactory<Link>();
- droidFactory.setMap(new HashMap<String, Object>());
+ Droid<Link> droid = DroidsFactory.createSimpleCrawlingDroid(
+ targetURI,
+ testHandler);
+
+ droid.init();
+ droid.start();
+ droid.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS);
- SimpleDelayTimer simpleDelayTimer = new SimpleDelayTimer();
- simpleDelayTimer.setDelayMillis(100);
-
- SimpleTaskQueue<Link> simpleQueue = new SimpleTaskQueue<Link>();
-
- MultiThreadedTaskMaster<Link> taskMaster = new MultiThreadedTaskMaster<Link>();
- taskMaster.setMaxThreads( 1 );
- taskMaster.setDelayTimer( simpleDelayTimer );
+ Assert.assertFalse(visitedLinks.isEmpty());
+ Assert.assertEquals(5, visitedLinks.size());
+ Assert.assertEquals(new URL(baseURI + "/start_html"), visitedLinks.get(0));
+ Assert.assertEquals(new URL(baseURI + "/page1_html"), visitedLinks.get(1));
+ Assert.assertEquals(new URL(baseURI + "/page2_html"), visitedLinks.get(2));
+ Assert.assertEquals(new URL(baseURI + "/page3_html"), visitedLinks.get(3));
+ Assert.assertEquals(new URL(baseURI + "/page4_html"), visitedLinks.get(4));
+ }
+
+ @Test
+ public void testTerminateCrawlingOnException() throws Exception
+ {
+ this.testserver.register("*", new ResourceHandler());
+ this.testserver.start();
- CrawlingDroid helloCrawler = new CrawlingDroid( simpleQueue, taskMaster );
- helloCrawler.setFiltersFactory(filtersFactory);
- helloCrawler.setParserFactory(parserFactory);
- helloCrawler.setProtocolFactory(protocolFactory);
- helloCrawler.setHandlerFactory(handlerFactory);
+ String baseURI = "http:/" + this.testserver.getServiceAddress();
+ String targetURI = baseURI + "/start_html";
- Collection<String> initialLocations = new ArrayList<String>();
- initialLocations.add( targetURI );
- helloCrawler.setInitialLocations(initialLocations);
+ final Set<URL> visitedLinks = new HashSet<URL>();
+
+ Handler testHandler = new Handler() {
+
+ public void handle(InputStream openStream, URL url, Parse parse) {
+ visitedLinks.add(url);
+ if (url.getPath().equals("/page3_html")) {
+ throw new RuntimeException("Oppsie!!!");
+ }
+ }
+
+ };
- helloCrawler.init();
- helloCrawler.start();
+ Droid<Link> droid = DroidsFactory.createSimpleCrawlingDroid(
+ targetURI,
+ testHandler);
+
+ SequentialTaskMaster<Link> taskMaster = (SequentialTaskMaster<Link>) droid.getTaskMaster();
+ taskMaster.setExceptionHandler(new TaskExceptionHandler() {
+
+ public TaskExceptionResult handleException(Exception ex) {
+ if (ex instanceof RuntimeException) {
+ return TaskExceptionResult.FATAL;
+ }
+ return TaskExceptionResult.WARN;
+ }
+
+ });
- helloCrawler.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS);
+ droid.init();
+ droid.start();
+ droid.getTaskMaster().awaitTermination(30, TimeUnit.SECONDS);
Assert.assertFalse(visitedLinks.isEmpty());
- Assert.assertEquals(5, visitedLinks.size());
+ Assert.assertEquals(4, visitedLinks.size());
Assert.assertTrue(visitedLinks.contains(new URL(baseURI + "/start_html")));
Assert.assertTrue(visitedLinks.contains(new URL(baseURI + "/page1_html")));
Assert.assertTrue(visitedLinks.contains(new URL(baseURI + "/page2_html")));
Assert.assertTrue(visitedLinks.contains(new URL(baseURI + "/page3_html")));
- Assert.assertTrue(visitedLinks.contains(new URL(baseURI + "/page4_html")));
}
-
+
}