You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@labs.apache.org by th...@apache.org on 2008/08/26 14:53:17 UTC
svn commit: r689051 - in /labs/droids/trunk/src/core/java/org/apache/droids:
DefaultWorker.java droids-core-context.xml
droids-core-factories-context.xml parse/html/HtmlParser.java
Author: thorsten
Date: Tue Aug 26 05:53:16 2008
New Revision: 689051
URL: http://svn.apache.org/viewvc?rev=689051&view=rev
Log:
Enhancing the default crawler to also save images and the like
Modified:
labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java
labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml
labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml
labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java
Modified: labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java?rev=689051&r1=689050&r2=689051&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/DefaultWorker.java Tue Aug 26 05:53:16 2008
@@ -62,21 +62,23 @@
sleep(delayTimer.getDelayMillis());
}
setUri(link.getId());
- Core.threadMessage("uri " + getUri());
- setProtocol(protocolFactory.getProtocol(getUri()));
- if (getProtocol().isAllowed(getUri())) {
- String contentType = getProtocol().getContentType(getUri());
+ String url = getUri();
+ Core.threadMessage("url " + url);
+ setProtocol(protocolFactory.getProtocol(url));
+ long workerId = getId();
+ if (getProtocol().isAllowed(url)) {
+ String contentType = getProtocol().getContentType(url);
Core.threadMessage("contentType " + contentType);
parser = parserFactory.getParser(contentType);
// parse contains the outlinks and can be used later
Parse parse = getParse();
handle(parse);
//Core.threadMessage("Trying to shut down "+getId());
- getDroid().finishedWorker(getId());
+ getDroid().finishedWorker(workerId);
} else {
Core.threadMessage("stopping processing since"
+ " bots are not allowed for this url.");
- getDroid().finishedWorker(getId());
+ getDroid().finishedWorker(workerId);
}
} catch (Exception e) {
e.printStackTrace();
@@ -85,8 +87,8 @@
}
protected void handle(Parse parse) throws MalformedURLException, IOException {
- if (null != parse)
- handlerFactory.handle(getProtocol().openStream(getUri()), new URL(getUri()), parse);
+ String url = getUri();
+ handlerFactory.handle(getProtocol().openStream(url), new URL(url), parse);
}
protected Parse getParse() {
Modified: labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml?rev=689051&r1=689050&r2=689051&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/droids-core-context.xml Tue Aug 26 05:53:16 2008
@@ -14,7 +14,7 @@
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:configurator="http://cocoon.apache.org/schema/configurator"
- xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.0.xsd
+ xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
http://cocoon.apache.org/schema/configurator http://cocoon.apache.org/schema/configurator/cocoon-configurator-1.0.1.xsd">
<import resource="droids-core-factories-context.xml"/>
Modified: labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml?rev=689051&r1=689050&r2=689051&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/droids-core-factories-context.xml Tue Aug 26 05:53:16 2008
@@ -3,7 +3,7 @@
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:configurator="http://cocoon.apache.org/schema/configurator"
- xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.0.xsd
+ xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-2.5.xsd
http://cocoon.apache.org/schema/configurator http://cocoon.apache.org/schema/configurator/cocoon-configurator-1.0.1.xsd">
<!-- Core - factories register -->
Modified: labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java?rev=689051&r1=689050&r2=689051&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java Tue Aug 26 05:53:16 2008
@@ -53,6 +53,8 @@
public Parse getParse(InputStream stream, Task newLink) {
this.link = newLink;
+ final String id = link.getId();
+ System.out.println("id: "+id);
try {
this.base = new URL(newLink.getId());
} catch (MalformedURLException e1) {
@@ -60,10 +62,10 @@
}
ParseData parseData = null;
// setup filter chain
- XMLDocumentFilter[] filters = { getRemover() };
+ final XMLDocumentFilter[] filters = { getRemover() };
// create HTML parser
- DOMFragmentParser parser = getParser(filters);
- DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+ final DOMFragmentParser parser = getParser(filters);
+ final DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
// parse document
// XMLInputSource source = new XMLInputSource(null, uri, uri);
try {
@@ -77,7 +79,7 @@
}
private ParseData extract(DocumentFragment node) {
- ArrayList<Outlink> links = new ArrayList<Outlink>();
+ final ArrayList<Outlink> links = new ArrayList<Outlink>();
try {
extractLinks(node, links, new HashSet<String>());
} catch (MalformedURLException e) {
@@ -89,7 +91,7 @@
}
private DOMFragmentParser getParser(XMLDocumentFilter[] filters) {
- DOMFragmentParser parser = new DOMFragmentParser();
+ final DOMFragmentParser parser = new DOMFragmentParser();
try {
parser.setProperty("http://cyberneko.org/html/properties/filters",
filters);
@@ -112,9 +114,12 @@
private static ElementRemover getRemover() {
// create element remover filter
- ElementRemover remover = new ElementRemover();
+ final ElementRemover remover = new ElementRemover();
// set which elements to accept
remover.acceptElement("a", new String[] { "href" });
+ remover.acceptElement("link", new String[] { "href" });
+ remover.acceptElement("img", new String[] { "src" });
+ remover.acceptElement("script", new String[] { "src" });
// completely remove some elements
remover.removeElement("script");
remover.removeElement("head");
@@ -124,13 +129,14 @@
private void extractLinks(Node node, ArrayList<Outlink> links,
HashSet<String> set) throws MalformedURLException {
if (node.getNodeType() == Node.ELEMENT_NODE) {
- if ("a".equalsIgnoreCase(node.getNodeName())) {
+ if ("a".equalsIgnoreCase(node.getNodeName())||"img".equalsIgnoreCase(node.getNodeName())
+ ||"link".equalsIgnoreCase(node.getNodeName())||"script".equalsIgnoreCase(node.getNodeName()) ) {
NamedNodeMap attrs = node.getAttributes();
String target = null;
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName();
- if (attrName.equalsIgnoreCase("href")) {
+ if (attrName.equalsIgnoreCase("href")||attrName.equalsIgnoreCase("src")) {
target = attr.getNodeValue();
try {
String newUrl = "";
@@ -139,17 +145,19 @@
if(base.getPort()>-1){
newUrl+=":"+base.getPort();
}
- }else{
+ }else if(!target.toLowerCase().startsWith("javascript")){
newUrl=new URL(base, target).toString();
}
- final Outlink outlink = new Outlink(
- target.contains(":/") ? target : newUrl, link.getDepth() + 1);
- log.debug("set size: "+set.size());
- log.debug("outlink.getToUrl(): "+outlink.getToUrl());
- log.debug("set.contains(outlink.getToUrl(): "+set.contains(outlink.getToUrl()));
- if (!set.contains(outlink.getToUrl())) {
- set.add(outlink.getToUrl());
- links.add(outlink);
+ if (!newUrl.equals("")) {
+ final Outlink outlink = new Outlink(
+ target.contains(":/") ? target : newUrl, link.getDepth() + 1);
+ log.debug("set size: "+set.size());
+ log.debug("outlink.getToUrl(): "+outlink.getToUrl());
+ log.debug("set.contains(outlink.getToUrl(): "+set.contains(outlink.getToUrl()));
+ if (!set.contains(outlink.getToUrl())) {
+ set.add(outlink.getToUrl());
+ links.add(outlink);
+ }
}
} catch (Exception e) {
log.fatal(e);
@@ -158,7 +166,7 @@
}
}
}
- NodeList children = node.getChildNodes();
+ final NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++) {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org