You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by mi...@apache.org on 2009/09/01 22:11:33 UTC
svn commit: r810273 [1/4] - in /incubator/droids/trunk/droids-crawler: ./
docs/ docs/diagrams/ src/ src/main/ src/main/groovy/ src/main/java/
src/main/java/org/ src/main/java/org/apache/
src/main/java/org/apache/droids/ src/main/java/org/apache/droids/...
Author: mingfai
Date: Tue Sep 1 22:11:29 2009
New Revision: 810273
URL: http://svn.apache.org/viewvc?rev=810273&view=rev
Log:
imported initial droids-crawler
Added:
incubator/droids/trunk/droids-crawler/README.txt
incubator/droids/trunk/droids-crawler/docs/
incubator/droids/trunk/droids-crawler/docs/diagrams/
incubator/droids/trunk/droids-crawler/docs/diagrams/concepts.png (with props)
incubator/droids/trunk/droids-crawler/docs/diagrams/concepts.vsd (with props)
incubator/droids/trunk/droids-crawler/docs/diagrams/uml.mdzip (with props)
incubator/droids/trunk/droids-crawler/droids-crawler.iml
incubator/droids/trunk/droids-crawler/pom.xml
incubator/droids/trunk/droids-crawler/src/
incubator/droids/trunk/droids-crawler/src/main/
incubator/droids/trunk/droids-crawler/src/main/groovy/
incubator/droids/trunk/droids-crawler/src/main/java/
incubator/droids/trunk/droids-crawler/src/main/java/org/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Droid.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Entity.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Task.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/TaskMaster.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Worker.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultRejectedExecutionHandler.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultThreadFactory.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerController.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerService.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerWorker.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerController.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerException.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerService.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerWorker.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/Link.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/LocalCrawlerService.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/RoundRobinCrawlerService.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/StandaloneCrawlerController.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/AbstractExtractor.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/Extractor.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/ExtractorException.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractor.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/AbstractFetcher.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/DefaultFetcherFactory.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/Fetcher.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherException.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/FetcherFactory.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/HttpFetcher.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcher.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/delay/Delay.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/CrawlerHttpClient.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/fetcher/http/HttpHeaderSupport.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/DepthFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FetchFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/Filter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/FilterSupport.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/LinkFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/ParseFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StateFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/StatsFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/extract/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/extract/IncludeFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/extract/RefererFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/fetch/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/fetch/HttpHeaderFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/fetch/LinkAttributeFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/link/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/filter/link/NoRepeatFilter.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/AbstractParser.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/DefaultParserFactory.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/Parser.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/ParserException.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/ParserFactory.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/impl/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/impl/AbstractHierarchicalDataParser.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/impl/JerichoHtmlParser.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/impl/NekoHtmlParser.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/impl/SAXContentHandler.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/impl/SAXElementParser.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/parser/impl/StAXElementParser.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/CrawlerExecutorService.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/DefaultLinkQueue.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/LinkMatcher.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/LinkNormalizer.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/ParamUtils.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/URIResolver.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/WeightComparator.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/Weighted.java
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/logging/
incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/util/logging/LogUtils.java
incubator/droids/trunk/droids-crawler/src/main/resources/
incubator/droids/trunk/droids-crawler/src/main/resources/log4j.xml
incubator/droids/trunk/droids-crawler/src/main/resources/sample-crawler-service-cloud.xml
incubator/droids/trunk/droids-crawler/src/test/
incubator/droids/trunk/droids-crawler/src/test/groovy/
incubator/droids/trunk/droids-crawler/src/test/groovy/org/
incubator/droids/trunk/droids-crawler/src/test/groovy/org/apache/
incubator/droids/trunk/droids-crawler/src/test/groovy/org/apache/droids/
incubator/droids/trunk/droids-crawler/src/test/groovy/org/apache/droids/crawler/
incubator/droids/trunk/droids-crawler/src/test/java/
incubator/droids/trunk/droids-crawler/src/test/java/org/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/DependencyTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/DummyCrawlerService.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/LinkTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/LocalCrawlerServiceTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/RoundRobinCrawlerServiceTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/SimpleCrawlerControllerTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/extractor/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/extractor/HtmlElementLinkExtractorTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/fetcher/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/fetcher/HttpFetcherTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/fetcher/appengine/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/fetcher/appengine/AppEngineFetcherTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/filter/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/filter/StatsFilterTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/parser/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/parser/DefaultParserFactoryTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/parser/impl/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/parser/impl/NekoHtmlParserTest.java
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/util/
incubator/droids/trunk/droids-crawler/src/test/java/org/apache/droids/crawler/util/ParamUtilsTest.java
incubator/droids/trunk/droids-crawler/src/test/resources/
incubator/droids/trunk/droids-crawler/src/test/resources/log4j.xml
incubator/droids/trunk/droids-crawler/src/test/resources/test-LocalCrawlerServiceTest.xml
incubator/droids/trunk/droids-crawler/src/test/resources/test-RoundRobinCrawlerServiceTest.xml
incubator/droids/trunk/droids-crawler/src/test/resources/test-StandaloneCrawlerControllerTest.xml
incubator/droids/trunk/droids-crawler/src/test/resources/test-extractor.HtmlElementLinkExtractorTest.xml
incubator/droids/trunk/droids-crawler/src/test/resources/test-fetcher.HttpFetcherTest.xml
incubator/droids/trunk/droids-crawler/src/test/resources/test-parser.DefaultParserFactoryTest.xml
incubator/droids/trunk/droids-crawler/src/test/resources/test-parser.impl.NekoHtmlParserTest.xml
incubator/droids/trunk/droids-crawler/src/test/resources/test-util.ParamUtilsTest.xml
Added: incubator/droids/trunk/droids-crawler/README.txt
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/README.txt?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/README.txt (added)
+++ incubator/droids/trunk/droids-crawler/README.txt Tue Sep 1 22:11:29 2009
@@ -0,0 +1 @@
+droids-crawler is under development. The current trunk release is not integrated with droids-core yet.
\ No newline at end of file
Added: incubator/droids/trunk/droids-crawler/docs/diagrams/concepts.png
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/docs/diagrams/concepts.png?rev=810273&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/droids/trunk/droids-crawler/docs/diagrams/concepts.png
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/droids/trunk/droids-crawler/docs/diagrams/concepts.vsd
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/docs/diagrams/concepts.vsd?rev=810273&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/droids/trunk/droids-crawler/docs/diagrams/concepts.vsd
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/droids/trunk/droids-crawler/docs/diagrams/uml.mdzip
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/docs/diagrams/uml.mdzip?rev=810273&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/droids/trunk/droids-crawler/docs/diagrams/uml.mdzip
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/droids/trunk/droids-crawler/droids-crawler.iml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/droids-crawler.iml?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/droids-crawler.iml (added)
+++ incubator/droids/trunk/droids-crawler/droids-crawler.iml Tue Sep 1 22:11:29 2009
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module relativePaths="true" MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4">
+ <component name="FacetManager">
+ <facet type="Groovy" name="Groovy">
+ <configuration compile="true" />
+ </facet>
+ <facet type="Spring" name="Spring">
+ <configuration />
+ </facet>
+ </component>
+ <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_5" inherit-compiler-output="false">
+ <output url="file://$MODULE_DIR$/target/classes" />
+ <output-test url="file://$MODULE_DIR$/target/test-classes" />
+ <content url="file://$MODULE_DIR$">
+ <sourceFolder url="file://$MODULE_DIR$/src/main/groovy" isTestSource="false" />
+ <sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
+ <sourceFolder url="file://$MODULE_DIR$/src/main/resources" isTestSource="false" />
+ <sourceFolder url="file://$MODULE_DIR$/src/test/groovy" isTestSource="true" />
+ <sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
+ <sourceFolder url="file://$MODULE_DIR$/src/test/resources" isTestSource="true" />
+ <excludeFolder url="file://$MODULE_DIR$/target" />
+ </content>
+ <orderEntry type="inheritedJdk" />
+ <orderEntry type="sourceFolder" forTests="false" />
+ <orderEntry type="module" module-name="droids-core" exported="" />
+ <orderEntry type="module" module-name="droids-norobots" exported="" />
+ <orderEntry type="library" exported="" name="Maven: commons-logging:commons-logging:1.1.1" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.apache.httpcomponents:httpclient:4.0-beta2" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.apache.httpcomponents:httpcore:4.0" level="project" />
+ <orderEntry type="library" exported="" name="Maven: commons-codec:commons-codec:1.3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: nekohtml:nekohtml:1.9.6.2" level="project" />
+ <orderEntry type="library" exported="" name="Maven: xerces:xercesImpl:2.8.1" level="project" />
+ <orderEntry type="library" name="Maven: log4j:log4j:1.2.15" level="project" />
+ <orderEntry type="library" exported="" name="Maven: commons-io:commons-io:1.4" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.codehaus.groovy:groovy-all:1.6.3" level="project" />
+ <orderEntry type="library" name="Maven: junit:junit:4.5" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.apache.ant:ant:1.7.1" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.apache.ant:ant-launcher:1.7.1" level="project" />
+ <orderEntry type="library" exported="" name="Maven: jline:jline:0.9.94" level="project" />
+ <orderEntry type="library" exported="" name="Maven: net.sourceforge.nekohtml:nekohtml:1.9.12" level="project" />
+ <orderEntry type="library" exported="" name="Maven: net.htmlparser.jericho:jericho-html:3.1" level="project" />
+ <orderEntry type="library" exported="" name="Maven: net.jcip:jcip-annotations:1.0" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-core:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-asm:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-web:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: com.caucho:hessian:3.1.5" level="project" />
+ <orderEntry type="library" exported="" name="Maven: com.caucho:burlap:2.1.12" level="project" />
+ <orderEntry type="library" exported="" name="Maven: aopalliance:aopalliance:1.0" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-aop:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-beans:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-context:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-expression:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.antlr:antlr:3.0.1" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.antlr:stringtemplate:3.1-b1" level="project" />
+ <orderEntry type="library" exported="" name="Maven: antlr:antlr:2.7.7" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-oxm:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.springframework:spring-webmvc:3.0.0.M3" level="project" />
+ <orderEntry type="library" exported="" name="Maven: org.aspectj:aspectjweaver:1.6.2" level="project" />
+ <orderEntry type="library" exported="" name="Maven: com.google.appengine:appengine-api-1.0-sdk:1.2.1" level="project" />
+ </component>
+</module>
+
Added: incubator/droids/trunk/droids-crawler/pom.xml
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/pom.xml?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/pom.xml (added)
+++ incubator/droids/trunk/droids-crawler/pom.xml Tue Sep 1 22:11:29 2009
@@ -0,0 +1,276 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+ <!--<parent>
+ <artifactId>droids</artifactId>
+ <groupId>org.apache.droids</groupId>
+ <version>0.1-incubating-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>-->
+ <groupId>org.apache.droids</groupId>
+ <version>0.1</version>
+
+ <artifactId>droids-crawler</artifactId>
+ <name>Droids Crawler Components</name>
+ <inceptionYear>2009</inceptionYear>
+ <description>Droids Crawler Components</description>
+ <packaging>jar</packaging>
+
+ <dependencies>
+ <!-- DROIDS DEPENDENCIES -->
+ <!-- TODO: start with no dependency first; include droids-core at a later stage -->
+ <dependency>
+ <groupId>org.apache.droids</groupId>
+ <artifactId>droids-core</artifactId>
+ <version>0.1-incubating-SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <!--TODO: sync with parent, remove this -->
+ <version>1.1.1</version>
+ </dependency>
+ <dependency>
+ <!--TODO: sync with parent, remove this -->
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ <version>1.2.15</version>
+ <scope>provided</scope>
+ <exclusions>
+ <exclusion>
+ <groupId>javax.mail</groupId>
+ <artifactId>mail</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>javax.jms</groupId>
+ <artifactId>jms</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jdmk</groupId>
+ <artifactId>jmxtools</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jmx</groupId>
+ <artifactId>jmxri</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <!-- TODO sync version -->
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>1.4</version>
+ </dependency>
+ <dependency>
+ <groupId>org.codehaus.groovy</groupId>
+ <artifactId>groovy-all</artifactId>
+ <version>1.6.3</version>
+ </dependency>
+ <dependency>
+ <!--TODO: sync with parent -->
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpcore</artifactId>
+ <version>4.0</version>
+ </dependency>
+ <dependency>
+ <!--TODO: sync with parent -->
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ <version>4.0-beta2</version>
+ </dependency>
+ <dependency>
+ <!--TODO: sync with parent -->
+ <groupId>net.sourceforge.nekohtml</groupId>
+ <artifactId>nekohtml</artifactId>
+ <version>1.9.12</version>
+ <!--<scope>compile</scope>-->
+ <exclusions>
+ <exclusion>
+ <groupId>xml-apis</groupId>
+ <artifactId>xml-apis</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>net.htmlparser.jericho</groupId>
+ <artifactId>jericho-html</artifactId>
+ <version>3.1</version>
+ </dependency>
+ <dependency>
+ <!-- for the @ThreadSafe annotation -->
+ <groupId>net.jcip</groupId>
+ <artifactId>jcip-annotations</artifactId>
+ <version>1.0</version>
+ </dependency>
+
+
+ <!-- Spring 3.0 -->
+ <dependency>
+ <groupId>org.springframework</groupId>
+ <artifactId>spring-core</artifactId>
+ <version>3.0.0.M3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.springframework</groupId>
+ <artifactId>spring-web</artifactId>
+ <version>3.0.0.M3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.springframework</groupId>
+ <artifactId>spring-webmvc</artifactId>
+ <version>3.0.0.M3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.springframework</groupId>
+ <artifactId>spring-aop</artifactId>
+ <version>3.0.0.M3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.aspectj</groupId>
+ <artifactId>aspectjweaver</artifactId>
+ <version>1.6.2</version>
+ </dependency>
+
+ <!-- AppEngine -->
+ <dependency>
+ <groupId>com.google.appengine</groupId>
+ <artifactId>appengine-api-1.0-sdk</artifactId>
+ <version>1.2.1</version>
+ <scope>compile</scope>
+ </dependency>
+
+ <!-- test -->
+ <dependency>
+ <!-- TODO sync version -->
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ <version>4.5</version>
+ </dependency>
+
+ </dependencies>
+
+ <build>
+ <resources>
+ <resource>
+ <filtering>false</filtering>
+ <directory>src/main/resources</directory>
+ </resource>
+ <resource>
+ <filtering>false</filtering>
+ <directory>src/main/java</directory>
+ <includes>
+ <include>**</include>
+ </includes>
+ <excludes>
+ <exclude>**/*.java</exclude>
+ </excludes>
+ </resource>
+ </resources>
+ <testResources>
+ <testResource>
+ <filtering>false</filtering>
+ <directory>src/test/java</directory>
+ <includes>
+ <include>**</include>
+ </includes>
+ <excludes>
+ <exclude>**/*.java</exclude>
+ </excludes>
+ </testResource>
+ <testResource>
+ <filtering>false</filtering>
+ <directory>src/test/resources</directory>
+ <includes>
+ <include>*.xml</include>
+ </includes>
+ </testResource>
+ </testResources>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <configuration>
+ <source>1.5</source>
+ <target>1.5</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-antrun-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>compile</id>
+ <phase>compile</phase>
+ <configuration>
+ <tasks>
+ <taskdef name="groovyc" classname="org.codehaus.groovy.ant.Groovyc">
+ <classpath refid="maven.compile.classpath"/>
+ </taskdef>
+ <mkdir dir="${project.build.outputDirectory}"/>
+ <groovyc destdir="${project.build.outputDirectory}"
+ srcdir="${basedir}/src/main/groovy/" listfiles="true">
+ <classpath refid="maven.compile.classpath"/>
+ </groovyc>
+ </tasks>
+ </configuration>
+ <goals>
+ <goal>run</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>test-compile</id>
+ <phase>test-compile</phase>
+ <configuration>
+ <tasks>
+ <taskdef name="groovyc" classname="org.codehaus.groovy.ant.Groovyc">
+ <classpath refid="maven.compile.classpath"/>
+ </taskdef>
+ <mkdir dir="${project.build.testOutputDirectory}"/>
+ <groovyc destdir="${project.build.testOutputDirectory}"
+ srcdir="${basedir}/src/test/groovy/" listfiles="true">
+ <classpath refid="maven.test.classpath"/>
+ </groovyc>
+ </tasks>
+ </configuration>
+ <goals>
+ <goal>run</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <version>2.4.3</version>
+ <configuration>
+ <includes>
+ <include>**/*Test.java</include>
+ </includes>
+ <forkMode>once</forkMode>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Droid.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Droid.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Droid.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Droid.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.core;
+
+import org.apache.droids.exception.DroidsException;
+
+public interface Droid<T extends Task>{ //TODO use Droid from core and remove this dummy interface
+
+ void init() throws DroidsException;
+
+ boolean isStarted();
+
+ void start();
+
+ Worker getNewWorker();
+
+ TaskMaster getTaskMaster();
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Entity.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Entity.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Entity.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Entity.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.core;
+
+import org.apache.http.Header;
+import org.apache.http.HttpEntity;
+import org.apache.http.message.BasicHeader;
+import org.apache.http.protocol.HTTP;
+import org.apache.http.util.EntityUtils;
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
+
+import java.io.*;
+
+/**
+ * A serializable counterpart of HttpEntity that uses a byte[] as buffer, supports an additional "charset" field, and
+ * may later gain other fields required by the Crawler.
+ * <p/>
+ * It is named Entity rather than HttpEntity to indicate that the Crawler may support more than HTTP crawling.
+ * TODO consider making it extend HashMap like the Link
+ */
+public class Entity implements Serializable{
+    static Log log = LogFactory.getLog(Entity.class);
+    private static final long serialVersionUID = 1L;
+    /** Content-Type header value, or null when unknown. */
+    protected String contentType;
+    /** Content-Encoding header value (a content coding such as "gzip", not a character set), or null when unknown. */
+    protected String contentEncoding;
+    /** Name of the character set that {@link #getText()} uses to decode the content, or null when unknown. */
+    protected String charset;//TODO review this
+    /** Raw content bytes; the array is stored and handed out by reference, never copied. */
+    protected final byte[] content;
+
+    /** Wraps the given byte array as the content without copying it. */
+    public Entity(byte[] input){
+        this.content = input;
+    }
+
+    /** Buffers the body of the given HttpEntity and records its Content-Type and Content-Encoding header values. */
+    public Entity(HttpEntity httpEntity) throws IOException{
+        this.content = EntityUtils.toByteArray(httpEntity);
+        this.contentEncoding = httpEntity.getContentEncoding() != null ? httpEntity.getContentEncoding().getValue() : null;
+        this.contentType = httpEntity.getContentType() != null ? httpEntity.getContentType().getValue() : null;
+    }
+
+    /*public Entity(InputStream inputStream, long size) throws IOException{
+     this.httpEntity = new BufferedHttpEntity(new InputStreamEntity(inputStream, size));
+     }*/
+
+    /** Summarizes the content length, content type, content encoding and charset. */
+    @Override public String toString(){
+        StringBuilder out = new StringBuilder();
+        out.append("Entity ( ");
+        out.append("content.length: ").append(content.length).append(", ");
+        out.append("contentType: ").append(contentType).append(", ");
+        out.append("contentEncoding: ").append(contentEncoding).append(", ");
+        out.append("charset: ").append(charset);
+        out.append(" )");
+        return out.toString();
+    }
+
+    /** Always true: {@link #getContent()} hands out a new stream over the buffered bytes on every call. */
+    public boolean isRepeatable(){return true;}
+
+    /** Not supported; always throws UnsupportedOperationException. */
+    public boolean isChunked(){
+        throw new UnsupportedOperationException("unimplemneted yet");
+    }
+
+    /** Returns the number of content bytes. */
+    public long getContentLength(){ return content.length; }
+
+    /*public Header getContentType(){ return new BasicHeader(HTTP.CONTENT_TYPE, contentType); }
+
+    public Header getContentEncoding(){ return new BasicHeader(HTTP.CONTENT_ENCODING, contentEncoding); }*/
+
+    /** Returns a new in-memory stream over the content bytes. */
+    public InputStream getContent(){ return new ByteArrayInputStream(this.content); }
+
+    /** Writes all content bytes to the given stream; the stream is neither flushed nor closed here. */
+    public void writeTo(OutputStream outputStream) throws IOException{ outputStream.write(this.content); }
+
+    /** Always false. */
+    public boolean isStreaming(){ return false; }
+
+    /** Does nothing. */
+    public void consumeContent() throws IOException{}
+
+    public String getContentType(){ return contentType; }
+    public String getContentEncoding(){ return contentEncoding; }
+    public void setContentType(String contentType){ this.contentType = contentType; }
+    public void setContentEncoding(String contentEncoding){ this.contentEncoding = contentEncoding; }
+    public String getCharset(){ return charset; }
+    public void setCharset(String charset){ this.charset = charset; }
+
+    /** Returns the backing content array itself, not a copy. */
+    public byte[] getBytes(){ return this.content; }
+
+    /**
+     * Get content as String.
+     * <p/>
+     * The bytes are decoded with {@link #getCharset()} when it is set. Content-Encoding names a content coding
+     * (e.g. "gzip") rather than a character set, so it is consulted only as a fallback for callers that stored a
+     * charset name there; when neither value is set the platform default charset is used.
+     *
+     * @return the decoded text, or null (after logging an error) if the chosen charset name is not supported
+     */
+    public String getText(){
+        String charsetName = getCharset() != null ? getCharset() : getContentEncoding();
+        try{
+            return charsetName != null ? new String(this.content, charsetName) : new String(this.content);
+        } catch (UnsupportedEncodingException e){
+            log.error("getText() - error - e: " + e, e);
+            return null;
+        }
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Task.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Task.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Task.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Task.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.core;
+
+/** Contract for a task; the only thing it exposes is a numeric id.
+ * TODO should id be long or String? see http://issues.apache.org/jira/browse/DROIDS-53
+ */
+public interface Task{
+ long getId(); // id of this task; NOTE(review): uniqueness scope is not visible here -- confirm with implementations
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/TaskMaster.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/TaskMaster.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/TaskMaster.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/TaskMaster.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.core;
+
+import java.util.Queue;
+import java.util.concurrent.ExecutorService;
+
+
+public interface TaskMaster<T extends Task>{
+ /** Returns the ExecutorService exposed by this task master. NOTE(review): presumably the pool the workers run on -- confirm. */
+ ExecutorService getExecutorService();
+ /** Processes the tasks in {@code queue} on behalf of {@code droid}. NOTE(review): blocking vs. asynchronous behaviour is not visible here. */
+ void processAllTasks(final Queue<T> queue, final Droid<T> droid);
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Worker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Worker.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Worker.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/Worker.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.core;
+
+import org.apache.droids.crawler.Link;
+
+public interface Worker<T extends Link> extends Runnable{
+ // Marker type: declares no members beyond Runnable#run(); T is bound to Link but not used in this declaration.
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultRejectedExecutionHandler.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultRejectedExecutionHandler.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultRejectedExecutionHandler.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultRejectedExecutionHandler.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.core.thread;
+
+import org.apache.commons.logging.LogFactory;
+import org.apache.commons.logging.Log;
+
+import java.util.concurrent.RejectedExecutionHandler;
+import java.util.concurrent.ThreadPoolExecutor;
+
+public class DefaultRejectedExecutionHandler implements RejectedExecutionHandler{
+    static Log log = LogFactory.getLog(DefaultRejectedExecutionHandler.class);
+    /** Logs the rejected task and the rejecting executor at WARN level; nothing is thrown and the task is not re-submitted. */
+    public void rejectedExecution(Runnable r, ThreadPoolExecutor executor){
+        log.warn(new StringBuilder("rejectedExecution() - runnable: ").append(r).append(", threadPoolExecutor: ").append(executor).toString());
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultThreadFactory.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultThreadFactory.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultThreadFactory.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/core/thread/DefaultThreadFactory.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.core.thread;
+
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class DefaultThreadFactory implements ThreadFactory{
+    final ThreadGroup threadGroup = new ThreadGroup(parentGroup(), "droids");
+    AtomicInteger threadNumber = new AtomicInteger(1);
+    private static ThreadGroup parentGroup(){ // the security manager's group when one is installed, else the calling thread's
+        SecurityManager manager = System.getSecurityManager();
+        return manager == null ? Thread.currentThread().getThreadGroup() : manager.getThreadGroup();
+    }
+    /** Creates a thread in this factory's "droids" group, named "droids-N" where N counts up from 1 per factory instance. */
+    public Thread newThread(Runnable r){ return new Thread(threadGroup, r, "droids-" + threadNumber.getAndIncrement(), 0); }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerController.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerController.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerController.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerController.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.crawler.filter.LinkFilter;
+import org.apache.droids.crawler.util.DefaultLinkQueue;
+import org.apache.droids.exception.DroidsException;
+import org.apache.droids.core.Droid;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.ApplicationContext;
+
+import java.io.Serializable;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.*;
+
+public abstract class AbstractCrawlerController<T extends Link> implements CrawlerController<T>{
+    protected static final Log log = LogFactory.getLog(AbstractCrawlerController.class);
+    @Autowired protected ApplicationContext applicationContext;
+    /** Links waiting to be processed; {@link #init()} creates a DefaultLinkQueue when none was set. */
+    protected Queue<T> queue;
+    @Autowired(required = false) protected List<LinkFilter<T>> filters;
+    /** Optional ordering that {@link #init()} applies to the filter list. */
+    @Autowired(required = false) @Qualifier("crawler.filterComparator") protected Comparator filterComparator;
+    @Autowired(required = false) protected CrawlerService<T> crawlerService;
+    /** Seeds queued by {@link #init()}; Link, String and URI elements are supported. */
+    protected Collection seeds;
+
+    /**
+     * Prepares the controller: creates a default queue when none was set, sorts the filters when both a filter list
+     * and a comparator are present, and queues every seed with a "created" timestamp that increases by one per seed.
+     * Seeds that cannot be parsed as a URI, or that are of an unsupported type, are logged and skipped.
+     */
+    public void init() throws DroidsException{
+        if (queue == null) queue = new DefaultLinkQueue<T>();
+        if (filters != null && filterComparator != null) Collections.sort(filters, filterComparator);
+        if (seeds == null) return;
+        long created = System.currentTimeMillis();
+        for (Object seed : seeds){
+            // How Link stores this map is not visible here; a fresh map per seed rules out aliasing between seeds.
+            Map<String, Serializable> seedLinkData = new HashMap<String, Serializable>();
+            seedLinkData.put("created", created++);
+            try{
+                if (seed instanceof Link) queue.add((T) seed);
+                else if (seed instanceof String) queue.add((T) new Link((String) seed, seedLinkData));
+                else if (seed instanceof URI) queue.add((T) new Link((URI) seed, seedLinkData));
+                else log.warn("unsupported seed type, skipped - seed: " + seed);
+            } catch (URISyntaxException e){
+                log.error("fail to construct a seed link, skipped - seed: " + seed, e);
+            }
+        }
+    }
+
+    /** Appends a filter, creating the filter list on first use. */
+    public boolean addFilter(LinkFilter<T> filter){
+        if (this.filters == null) this.filters = new ArrayList<LinkFilter<T>>();
+        return this.filters.add(filter);
+    }
+
+    /** Removes a filter; returns false when no filter list exists or the filter was not in it. */
+    public boolean removeFilter(LinkFilter<T> filter){
+        return this.filters != null && this.filters.remove(filter);
+    }
+
+    /** Replaces the filter list; the given list is stored as is and may be null. */
+    public void setFilters(List<LinkFilter<T>> filters){ this.filters = filters; }
+
+    /** Returns the filter list itself, or null when none was set. */
+    public List<LinkFilter<T>> getFilters(){ return filters; }
+
+    /** Returns true when at least one filter is registered. */
+    public boolean hasFilter(){
+        return filters != null && filters.size() > 0;
+    }
+
+    public Queue<T> getQueue(){
+        return queue;
+    }
+
+    /** Replaces the queue; a queue set before {@link #init()} is kept by it. */
+    public void setQueue(Queue<T> queue){
+        this.queue = queue;
+    }
+
+    public Comparator getFilterComparator(){ return filterComparator; }
+
+    public void setFilterComparator(Comparator filterComparator){ this.filterComparator = filterComparator; }
+
+    public CrawlerService<T> getCrawlerService(){ return crawlerService; }
+
+    public void setCrawlerService(CrawlerService<T> crawlerService){ this.crawlerService = crawlerService; }
+
+    /** Returns the seed collection itself, or null when none was set. */
+    public Collection getSeeds(){ return seeds; }
+
+    /** Sets the seeds that {@link #init()} turns into queued links. */
+    public void setSeeds(Collection seeds){ this.seeds = seeds; }
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerService.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerService.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerService.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerService.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.crawler.filter.FilterSupport;
+import org.apache.droids.crawler.filter.LinkFilter;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+public abstract class AbstractCrawlerService<T extends Link> implements CrawlerService<T>{
+    static Log log = LogFactory.getLog(AbstractCrawlerService.class);
+    /** Returns the local host name as the node identifier, or the literal string "null" when it cannot be resolved.
+     * Distributed Crawler Service shall override this method if it is not one instance per host.
+     */
+    public String getNode(){
+        try{
+            return InetAddress.getLocalHost().getHostName();
+        } catch (UnknownHostException e){
+            // Pass the exception as the cause so that its stack trace is logged together with some context.
+            log.error("getNode() - cannot resolve the local host name", e);
+            return "null";
+        }
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerWorker.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerWorker.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/AbstractCrawlerWorker.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.core.Worker;
+import org.apache.droids.crawler.filter.FilterSupport;
+import org.apache.droids.crawler.filter.LinkFilter;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+public abstract class AbstractCrawlerWorker<T extends Link> implements Worker, FilterSupport<LinkFilter<T>>{
+    static Log log = LogFactory.getLog(AbstractCrawlerWorker.class);
+    @Autowired(required = false) protected List<LinkFilter<T>> filters;
+
+    /**
+     * Passes a polled link through every filter in order; the value each filter returns replaces the link.
+     * NOTE(review): later filters still receive null if an earlier filter returned null -- confirm this is intended.
+     * @return the link left after filtering, marked POLLED, or null when the last filter returned null
+     */
+    protected T polled(T link){
+        if (hasFilter()){
+            for (LinkFilter<T> filter : getFilters()){
+                link = filter.polled(link);
+                if (log.isTraceEnabled()) log.trace("polled - filter: " + filter + ", link: " + link);
+            }
+        }
+        if (link != null) link.setState(Link.State.POLLED);
+        return link;
+    }
+
+    /*public void completed(T link, final Fetcher<T> fetcher, final Parser<T, Object> parser, final List<Extractor<T, ? extends Parser>> extractors, final Set<T> links);
+    public void failed(T link, int stage, Object object);
+    */
+    /** Notifies every filter that the link completed with the given outlinks, then marks a non-null link COMPLETED. */
+    protected T completed(T link, Set<T> outlinks){
+        if (hasFilter()){
+            for (LinkFilter<T> filter : getFilters()){
+                filter.completed(link, outlinks);
+                if (log.isTraceEnabled()) log.trace("completed - filter: " + filter + ", link: " + link);
+            }
+        }
+        if (link != null) link.setState(Link.State.COMPLETED);
+        return link;
+    }
+
+    /** Notifies every filter that the link failed, passing {@code object} through unchanged, then marks a non-null link FAILED. */
+    protected T failed(T link, Object object){
+        if (hasFilter()){
+            for (LinkFilter<T> filter : getFilters()){
+                filter.failed(link, object);
+                if (log.isTraceEnabled()) log.trace("failed - filter: " + filter + ", link: " + link);
+            }
+        }
+        if (link != null) link.setState(Link.State.FAILED);
+        return link;
+    }
+
+    /** Appends a filter, creating the filter list on first use. */
+    public boolean addFilter(LinkFilter<T> filter){
+        if (filters == null) filters = new ArrayList<LinkFilter<T>>();
+        return filters.add(filter);
+    }
+
+    /** Removes a filter; returns false, rather than throwing, when no filter list exists. */
+    public boolean removeFilter(LinkFilter<T> filter){
+        return filters != null && filters.remove(filter);
+    }
+
+    public void setFilters(List<LinkFilter<T>> filters){ this.filters = filters; }
+    public List<LinkFilter<T>> getFilters(){ return filters; }
+    /** Returns true when at least one filter is registered. */
+    public boolean hasFilter(){ return filters != null && filters.size() > 0; }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerController.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerController.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerController.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerController.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.droids.crawler;
+
+import org.apache.droids.core.Droid;
+import org.apache.droids.crawler.filter.FilterSupport;
+import org.apache.droids.crawler.filter.LinkFilter;
+
+import java.util.Queue;
+
+public interface CrawlerController<T extends Link> extends FilterSupport<LinkFilter<T>>{//TODO make it extends Droid
+ /** Returns the queue of links held by this controller. */
+ Queue<T> getQueue();
+ /** Returns the CrawlerService of this controller; AbstractCrawlerController injects it as optional, so it may be null there. */
+ CrawlerService<T> getCrawlerService();
+ /** Starts the controller. NOTE(review): the conditions under which CrawlerException is thrown are implementation-defined. */
+ void start() throws CrawlerException;
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerException.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerException.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerException.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerException.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+public class CrawlerException extends Exception {
+    // Checked exception of the crawler API; declared by CrawlerController#start().
+    /** Creates an exception with neither a detail message nor a cause. */
+    public CrawlerException() { super(); }
+
+    /** Creates an exception with the given detail message. */
+    public CrawlerException(String message) { super(message); }
+
+    /** Creates an exception with the given detail message and cause. */
+    public CrawlerException(String message, Throwable cause) { super(message, cause); }
+
+    /** Creates an exception that wraps the given cause. */
+    public CrawlerException(Throwable cause) {
+        super(cause);
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerService.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerService.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerService.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerService.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.droids.crawler;
+
+import org.apache.droids.crawler.parser.ParserException;
+import org.apache.droids.crawler.fetcher.FetcherException;
+import org.apache.droids.crawler.extractor.ExtractorException;
+
+/**
+ * Crawler Service is a Facade of Fetcher, Parser and Extractor. It uses Link (that extends Map) as the main data
+ * container for every process. "fetched", "parsed" and "extracted" attributes are added to the Link after each step.
+ */
+public interface CrawlerService<T extends Link>{
+ static final long DEFAULT_SERIALVERSIONID = 1L; // NOTE(review): presumably a shared default serialVersionUID for implementations -- confirm
+ /** Returns a version string for this service. NOTE(review): its format and meaning are not visible here. */
+ String getVersion();
+ /** Returns an identifier of the node this service runs on; AbstractCrawlerService uses the local host name. */
+ String getNode();
+
+ /**
+ * fetch, to specify Fetcher and Parser, use link attribute according to @org.apache.droids.crawler.Link
+ *
+ * @param link the link to fetch
+ * @return a Link with "fetched" attribute of @org.apache.droids.crawler.fetcher.Fetcher type
+ */
+ T fetch(T link) throws FetcherException;
+
+ /**
+ * fetch and parse, to specify Fetcher and Parser, use link attribute according to @org.apache.droids.crawler.Link
+ *
+ * @return a Link with "fetch.fetcher" attribute of @org.apache.droids.crawler.fetcher.Fetcher type, and
+ * "parsed" attribute of @org.apache.droids.crawler.parser.Parser type
+ */
+ T parse(T link) throws ParserException;
+ /** Runs the extract step; CrawlerWorker reads the result's "extracted" attribute as the set of outgoing links. */
+ T extract(T link) throws ExtractorException;
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerWorker.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerWorker.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerWorker.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/CrawlerWorker.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import java.util.Queue;
+import java.util.Set;
+
+/**
+ * Worker that polls a single link from the queue, runs it through the configured {@link CrawlerService} and
+ * reports the outcome through {@code completed(..)} or {@code failed(..)}.
+ */
+public class CrawlerWorker<T extends Link> extends AbstractCrawlerWorker<T>{
+    static Log log = LogFactory.getLog(CrawlerWorker.class);
+    protected Queue<T> queue;
+    @Autowired(required = false) protected CrawlerService<T> crawlerService;
+    /** Number of links this worker has reported as completed. */
+    protected int count = 0;
+
+    /**
+     * Polls one link, extracts it and reports it as completed together with its outlinks. Any exception raised
+     * on the way (including a missing queue or crawlerService) is logged and reported through
+     * {@code failed(..)}; the link passed to {@code failed(..)} is null when polling itself failed.
+     */
+    @SuppressWarnings("unchecked") // the "extracted" attribute is stored untyped in the Link map
+    public void run(){
+        T link = null;
+        try{
+            link = queue.poll();
+            if (log.isTraceEnabled()) log.trace("run() - polled link: " + link + ", crawlerService: " + crawlerService);
+            if (link == null){
+                log.warn("run() - fail to poll any link - queue: " + queue.toString());
+                return;
+            }
+
+            link = crawlerService.extract(link);
+            if (log.isTraceEnabled()) log.trace("run() - extracted - link: " + link);
+
+            // A successfully extracted link without the "extracted" attribute simply has no outlinks. It must
+            // still be reported as completed, otherwise it is neither completed nor failed and never counted.
+            Set<T> outlinks;
+            if (link.containsKey("extracted")){
+                outlinks = link.remove("extracted", Set.class);
+            } else{
+                // mutable on purpose: completed(..) receives a mutable set in the regular case as well
+                outlinks = new java.util.HashSet<T>();
+            }
+            this.completed(link, outlinks);
+            count++;
+        } catch (Exception e){
+            log.error("run() - error", e);
+            this.failed(link, e);
+        }
+    }
+
+    public Queue<T> getQueue(){
+        return queue;
+    }
+
+    public void setQueue(Queue<T> queue){
+        this.queue = queue;
+    }
+
+    public CrawlerService<T> getCrawlerService(){
+        return crawlerService;
+    }
+
+    public void setCrawlerService(CrawlerService<T> crawlerService){
+        this.crawlerService = crawlerService;
+    }
+
+    public int getCount(){
+        return count;
+    }
+
+    public void setCount(int count){
+        this.count = count;
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/Link.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/Link.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/Link.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/Link.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.apache.droids.crawler.util.URIResolver;
+import org.apache.droids.crawler.util.logging.LogUtils;
+
+import java.io.Serializable;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Link is the main unit for any crawling process including every step and every filter method.
+ * As a Map, it holds any arbitrary data.
+ * <p/>
+ * By design, any method that operates on a link takes the Link as its first argument.
+ */
+public class Link extends HashMap<String, Object> implements Serializable{
+    public static enum State{
+        INIT, POLLED, FETCHED, PARSED, EXTRACTED, COMPLETED, FAILED
+    }
+
+    /** Identifier derived from the URL, see {@link #createId(String)}. */
+    protected long id;
+    protected String url;
+    protected State state = State.INIT;
+
+    /** Creates a Link without URL; the URL is expected to be supplied through {@link #setUrl(String)}. */
+    public Link(){
+    }
+
+    public Link(URI uri){
+        this(uri, null);
+    }
+
+    public Link(String url) throws URISyntaxException{
+        this(new URI(url), null);
+    }
+
+    public Link(String url, Map<String, ?> initData) throws URISyntaxException{
+        this(new URI(url), initData);
+    }
+
+    /**
+     * @param uri      the location of this link, must not be null
+     * @param initData optional initial attributes copied into this map, may be null
+     */
+    public Link(URI uri, Map<String, ?> initData){
+        this.url = uri.toString();
+        this.id = createId(this.url);
+        if (initData != null){
+            for (Map.Entry<String, ?> entry : initData.entrySet()) this.put(entry.getKey(), entry.getValue());
+        }
+    }
+
+    /**
+     * Derives the id from the String hash code of the given URI, mapped to the non-negative 32 bit range.
+     */
+    protected long createId(String uri){
+        return uri.hashCode() & 0xFFFFFFFFL;
+    }
+
+    @Override
+    public String toString(){
+        StringBuilder out = new StringBuilder();
+        out.append("Link ( ").append("url: " + LogUtils.toString(url));
+        out.append(", id: " + LogUtils.toString(id));
+        out.append(", state: " + state);
+        if (this.size() > 0) out.append(", data: ").append(LogUtils.toString(this));
+        out.append(" )");
+        return out.toString();
+    }
+
+    /**
+     * Two links are equal when they are of the same class, hold the same attributes and have the same id and url.
+     * The url is part of the comparison because {@link #hashCode()} depends on it; leaving it out would let two
+     * equal links produce different hash codes.
+     */
+    @Override
+    public boolean equals(Object o){
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        if (!super.equals(o)) return false;
+
+        Link other = (Link) o;
+        if (id != other.id) return false;
+        return url == null ? other.url == null : url.equals(other.url);
+    }
+
+    /**
+     * Based on id and url only, so the hash code does not change when attributes are added to or removed
+     * from the map.
+     */
+    @Override
+    public int hashCode(){
+        int result = (int) (id ^ (id >>> 32));
+        result = 31 * result + (url != null ? url.hashCode() : 0);
+        return result;
+    }
+
+    /**
+     * Typed variant of {@link #get(Object)}. The class argument only drives type inference; the value is not
+     * checked here, so a value of another type surfaces as a ClassCastException at the call site.
+     */
+    @SuppressWarnings("unchecked")
+    public <T> T get(String key, Class<T> clazz){
+        return (T) this.get(key);
+    }
+
+    /**
+     * Typed variant of {@link #remove(Object)}. The class argument only drives type inference; the value is not
+     * checked here, so a value of another type surfaces as a ClassCastException at the call site.
+     */
+    @SuppressWarnings("unchecked")
+    public <T> T remove(String key, Class<T> clazz){
+        return (T) this.remove(key);
+    }
+
+    /**
+     * @return the host part of the url, or null when no url is set
+     */
+    public String getHost(){
+        URI uri = toUriOrNull();
+        return uri != null ? uri.getHost() : null;
+    }
+
+    /**
+     * Sets the url after validating it. A link that had no url yet (created through the no-arg constructor)
+     * also gets its id derived here, so it does not stay at 0; the id of a link that already had a url is
+     * left untouched.
+     */
+    public void setUrl(String url) throws URISyntaxException{
+        boolean hadNoUrl = this.url == null;
+        this.url = new URI(url).toString();
+        if (hadNoUrl) this.id = createId(this.url);
+    }
+
+    public String getUrl(){
+        return url;
+    }
+
+    /**
+     * Resolves the target against the url of this link.
+     *
+     * @return a new Link for the resolved URI, or null when it cannot be resolved or this link has no url
+     */
+    public Link resolve(String target){
+        if (url == null) return null;
+        try{
+            URI resolved = new URIResolver(new URI(url)).resolve(target);
+            return resolved != null ? new Link(resolved) : null;
+        } catch (URISyntaxException e){
+            // it shall not happen, new URI() is validated in constructor
+            return null;
+        }
+    }
+
+    public long getId(){
+        return id;
+    }
+
+    /**
+     * Not supported; the depth is kept as a map attribute instead.
+     */
+    public int getDepth(){
+        throw new UnsupportedOperationException("please use get(\"depth\")");
+    }
+
+    /**
+     * @return always null; Link does not keep a task date
+     */
+    public Date getTaskDate(){
+        return null;
+    }
+
+    /**
+     * @return the url as URI, or null when no url is set
+     */
+    public URI getURI(){
+        return toUriOrNull();
+    }
+
+    public State getState(){
+        return state;
+    }
+
+    public void setState(State state){
+        this.state = state;
+    }
+
+    /** Parses the url, returning null instead of failing when the url is missing or malformed. */
+    private URI toUriOrNull(){
+        if (url == null) return null;
+        try{
+            return new URI(url);
+        } catch (URISyntaxException e){
+            // it shall not happen, the url is validated by the constructors and setUrl()
+            return null;
+        }
+    }
+
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/LocalCrawlerService.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/LocalCrawlerService.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/LocalCrawlerService.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/LocalCrawlerService.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.droids.crawler.extractor.Extractor;
+import org.apache.droids.crawler.extractor.ExtractorException;
+import org.apache.droids.crawler.fetcher.Fetcher;
+import org.apache.droids.crawler.fetcher.FetcherException;
+import org.apache.droids.crawler.fetcher.FetcherFactory;
+import org.apache.droids.crawler.filter.LinkFilter;
+import org.apache.droids.crawler.filter.FetchFilter;
+import org.apache.droids.crawler.parser.Parser;
+import org.apache.droids.crawler.parser.ParserException;
+import org.apache.droids.crawler.parser.ParserFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import javax.annotation.PostConstruct;
+import java.io.Serializable;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * CrawlerService that performs fetch, parse and extract in-process, using the configured FetcherFactory,
+ * ParserFactory and Extractors.
+ */
+public class LocalCrawlerService<T extends Link> extends AbstractCrawlerService<T>{
+    static Log log = LogFactory.getLog(LocalCrawlerService.class);
+    @Autowired(required = false) protected FetcherFactory<T> fetcherFactory;
+    @Autowired(required = false) protected ParserFactory<T> parserFactory;
+    @Autowired(required = false) protected List<Extractor<T, Parser<T, ?>>> extractors;
+
+    /**
+     * Logs the injected collaborators.
+     *
+     * @return this service
+     */
+    @PostConstruct
+    public LocalCrawlerService<T> init(){
+        if (log.isInfoEnabled()){
+            log.info("init() - fetcherFactory: " + fetcherFactory);
+            log.info("\t\tparserFactory: " + parserFactory);
+            log.info("\t\textractors: " + ((extractors != null) ? extractors.size() : -1) + ", extractors: " + extractors);
+        }
+        return this;
+    }
+
+    public String getVersion(){return "0.1";}
+
+    /**
+     * Fetches the link with a Fetcher obtained from the FetcherFactory. The optional "params" attribute of the
+     * link is handed to the Fetcher.
+     *
+     * @return the link with the "fetched" attribute set to the Fetcher on success, or to {@code false} when the
+     *         Fetcher did not report success
+     * @throws FetcherException when no Fetcher supports the link or fetching fails for any other reason
+     */
+    public T fetch(T link) throws FetcherException{
+        if (log.isTraceEnabled()) log.trace("fetch() - link: " + link);
+        try{
+            Fetcher fetcher = fetcherFactory.newFetcher(link);
+            if (fetcher == null)
+                throw new FetcherException("no supported fetcher - link: " + link + ", fetcherFactory: " + fetcherFactory);
+            fetcher.fetch(link, link.get("params", Map.class));
+            link.put("fetched", fetcher.getStatusCode() == Fetcher.SUCCESS ? fetcher : false);
+            return link;
+        } catch (FetcherException e){
+            // already the right type; wrapping it again would only bury the real message
+            log.error("fetch() - error - link: " + link + ", error: " + e.getMessage());
+            throw e;
+        } catch (Exception e){
+            log.error("fetch() - error - link: " + link + ", error: " + e.getMessage());
+            throw new FetcherException("fail to fetch", e);
+        }
+    }
+
+    /**
+     * Parses the link; a link that has no "fetched" attribute yet is fetched first. The "fetched" attribute is
+     * removed from the link in the process.
+     *
+     * @return the link with the "parsed" attribute set to the Parser
+     * @throws ParserException       when the automatic fetch or the parsing fails
+     * @throws IllegalStateException when the link was not fetched successfully or no Parser supports the link
+     */
+    public T parse(T link) throws ParserException{
+        if (log.isTraceEnabled()) log.trace("parse() - link: " + link);
+        // read the attribute untyped: fetch() stores Boolean false when the fetch did not succeed
+        Object fetched = link.remove("fetched");
+
+        if (fetched == null){
+            try{
+                fetched = this.fetch(link).remove("fetched");
+            } catch (FetcherException e){
+                throw new ParserException("failed in auto-fetch", e);
+            }
+        }
+        // checked after the try block rather than in a finally: throwing from finally would discard the
+        // ParserException (and its cause) raised above
+        if (!(fetched instanceof Fetcher))
+            throw new IllegalStateException("link is not fetched - link: " + link);
+        Fetcher fetcher = (Fetcher) fetched;
+
+        if (log.isTraceEnabled()) log.trace("parse() - fetcher: " + fetcher + ", parserFactory: " + parserFactory);
+        Parser parser = parserFactory.newParser(link);
+        if (parser == null)
+            throw new IllegalStateException("no supported parser - link: " + link + ", parserFactory: " + parserFactory);
+        parser.parse(link, fetcher.getEntity(), null);
+        link.put("parsed", parser);
+        return link;
+    }
+
+    /**
+     * Runs every matching Extractor on the parsed link; a link that is not parsed yet is fetched and parsed
+     * first. The "parsed" attribute is removed from the link in the process.
+     *
+     * @return the link, with the "extracted" attribute set to the Set of outlinks when at least one was found
+     * @throws ExtractorException when the automatic fetch/parse fails or the link carries no Parser
+     */
+    @SuppressWarnings("unchecked") // the "parsed" attribute is stored untyped in the Link map
+    public T extract(T link) throws ExtractorException{
+        if (log.isTraceEnabled())
+            log.trace("extract() - link: " + link + ", extractors.size(): " + (extractors != null ? extractors.size() : -1) + ", extractors: " + extractors);
+        try{
+            if (!link.containsKey("parsed")){
+                if (!link.containsKey("fetched")) link = fetch(link);
+                link = parse(link);
+            }
+        } catch (FetcherException e){
+            throw new ExtractorException("failed in auto fetch", e);
+        } catch (ParserException e){
+            throw new ExtractorException("failed in auto parse", e);
+        }
+        if (!link.containsKey("parsed")) throw new ExtractorException("fail to auto fetch/parse - link: " + link);
+
+        Parser<T, ?> parser = link.remove("parsed", Parser.class);
+        if (parser == null) throw new ExtractorException("link must have a \"parsed\" attribute");
+
+        if (extractors == null){
+            log.warn("extract() - no extractor is configured");
+            return link;
+        }
+
+        Set<T> outlinks = new HashSet<T>();
+        for (Extractor<T, Parser<T, ?>> extractor : extractors){
+            if (!extractor.matches(link)) continue;
+            Set<T> found = extractor.extract(link, parser);
+            if (found != null && !found.isEmpty()) outlinks.addAll(found);
+        }
+        if (!outlinks.isEmpty()) link.put("extracted", (Serializable) outlinks);
+        return link;
+    }
+
+    public FetcherFactory<T> getFetcherFactory(){
+        return fetcherFactory;
+    }
+
+    public void setFetcherFactory(FetcherFactory<T> fetcherFactory){
+        this.fetcherFactory = fetcherFactory;
+    }
+
+    public ParserFactory<T> getParserFactory(){
+        return parserFactory;
+    }
+
+    public void setParserFactory(ParserFactory<T> parserFactory){
+        this.parserFactory = parserFactory;
+    }
+
+    public List<Extractor<T, Parser<T, ?>>> getExtractors(){
+        return extractors;
+    }
+
+    public void setExtractors(List<Extractor<T, Parser<T, ?>>> extractors){
+        this.extractors = extractors;
+    }
+}
Added: incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/RoundRobinCrawlerService.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/RoundRobinCrawlerService.java?rev=810273&view=auto
==============================================================================
--- incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/RoundRobinCrawlerService.java (added)
+++ incubator/droids/trunk/droids-crawler/src/main/java/org/apache/droids/crawler/RoundRobinCrawlerService.java Tue Sep 1 22:11:29 2009
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.crawler;
+
+import org.springframework.beans.factory.annotation.Autowired;
+import org.apache.droids.crawler.fetcher.FetcherException;
+import org.apache.droids.crawler.parser.ParserException;
+import org.apache.droids.crawler.extractor.ExtractorException;
+
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Delegate to the list of configured CrawlerService.
+ * <p/>
+ * One of the most useful use case is to configure a Crawler Controller to use a RoundRobinCrawlerService with
+ * multiple remote CrawlerService distributed in the cloud.
+ */
+public class RoundRobinCrawlerService<T extends Link> extends AbstractCrawlerService<T>{
+ @Autowired(required = false) List<CrawlerService<T>> crawlerServices;
+ protected AtomicInteger counter = new AtomicInteger();
+
+ public String getVersion(){return "0.1";}
+
+ public T fetch(T link) throws FetcherException{
+ return crawlerServices.get(counter.getAndIncrement() % crawlerServices.size()).fetch(link);
+ }
+
+ public T parse(T link) throws ParserException{
+ return crawlerServices.get(counter.getAndIncrement() % crawlerServices.size()).parse(link);
+ }
+
+ public T extract(T link) throws ExtractorException{
+ return crawlerServices.get(counter.getAndIncrement() % crawlerServices.size()).extract(link);
+ }
+
+ public List<CrawlerService<T>> getCrawlerServices(){
+ return crawlerServices;
+ }
+
+ public void setCrawlerServices(List<CrawlerService<T>> crawlerServices){
+ this.crawlerServices = crawlerServices;
+ }
+}