You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by mo...@apache.org on 2012/03/02 15:13:36 UTC
svn commit: r1296216 - in /incubator/any23/trunk/plugins/basic-crawler:
pom.xml src/main/java/org/apache/any23/cli/Crawler.java
src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java
src/test/java/org/apache/any23/cli/CrawlerTest.java
Author: mostarda
Date: Fri Mar 2 14:13:35 2012
New Revision: 1296216
URL: http://svn.apache.org/viewvc?rev=1296216&view=rev
Log:
Migrated to crawler4j version 3.3. This commit is related to issue #ANY23-37.
Modified:
incubator/any23/trunk/plugins/basic-crawler/pom.xml
incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java
incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java
incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java
Modified: incubator/any23/trunk/plugins/basic-crawler/pom.xml
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/pom.xml?rev=1296216&r1=1296215&r2=1296216&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/pom.xml (original)
+++ incubator/any23/trunk/plugins/basic-crawler/pom.xml Fri Mar 2 14:13:35 2012
@@ -49,39 +49,14 @@
<scope>provided</scope>
</dependency>
- <!-- BEGIN: Crawler4j -->
- <dependency>
- <groupId>edu.uci.ics</groupId>
- <artifactId>crawler4j</artifactId>
- <version>2.6.1</version>
+ <!-- Crawler4j -->
+ <dependency>
+ <groupId>edu.uci.ics</groupId>
+ <artifactId>crawler4j</artifactId>
+ <version>3.3</version>
+ <type>jar</type>
+ <scope>compile</scope>
</dependency>
- <dependency>
- <groupId>com.sleepycat</groupId>
- <artifactId>je</artifactId>
- <version>4.0.92</version>
- </dependency>
- <dependency>
- <groupId>it.unimi.dsi</groupId>
- <artifactId>fastutil</artifactId>
- <version>6.4.1</version>
- </dependency>
- <dependency>
- <groupId>it.unimi.dsi</groupId>
- <artifactId>dsiutils</artifactId>
- <version>2.0.1</version>
- </dependency>
- <dependency>
- <groupId>org.apache.httpcomponents</groupId>
- <artifactId>httpclient</artifactId>
- <version>4.1</version>
- </dependency>
- <!--TODO: resolve dependency conflict.-->
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- <version>1.4</version>
- </dependency>
- <!-- END: Crawler4j -->
</dependencies>
<build>
Modified: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java?rev=1296216&r1=1296215&r2=1296216&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java Fri Mar 2 14:13:35 2012
@@ -18,6 +18,8 @@
package org.apache.any23.cli;
import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.parser.HtmlParseData;
+import edu.uci.ics.crawler4j.parser.ParseData;
import org.apache.any23.plugin.crawler.CrawlerListener;
import org.apache.any23.plugin.crawler.SiteCrawler;
import org.apache.any23.source.StringDocumentSource;
@@ -86,20 +88,25 @@ public class Crawler extends Rover {
public void visitedPage(Page page) {
final String pageURL = page.getWebURL().getURL();
System.err.println( String.format("Processing page: [%s]", pageURL) );
- try {
- synchronized (roverLock) {
- Crawler.super.performExtraction(
- new StringDocumentSource(
- page.getHTML(),
- pageURL
- )
+ final ParseData parseData = page.getParseData();
+ if (parseData instanceof HtmlParseData) {
+ final HtmlParseData htmlParseData = (HtmlParseData) parseData;
+ try {
+ synchronized (roverLock) {
+ Crawler.super.performExtraction(
+ new StringDocumentSource(
+ htmlParseData.getHtml(),
+ pageURL
+
+ )
+ );
+ }
+ } catch (Exception e) {
+ System.err.println(
+ String.format("Error while processing page [%s], error: %s .", pageURL, e.getMessage())
);
}
- } catch (Exception e) {
- System.err.println(
- String.format("Error while processing page [%s], error: %s .", pageURL, e.getMessage())
- );
}
}
});
Modified: incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java?rev=1296216&r1=1296215&r2=1296216&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/main/java/org/apache/any23/plugin/crawler/SiteCrawler.java Fri Mar 2 14:13:35 2012
@@ -17,8 +17,12 @@
package org.apache.any23.plugin.crawler;
+import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
+import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
+import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import java.io.File;
import java.net.URL;
@@ -85,19 +89,9 @@ public class SiteCrawler {
private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER;
/**
- * Max allowed depth, <code>-1</code> means no limit.
+ * Internal crawler configuration.
*/
- private int maxDepth = -1;
-
- /**
- * Max allowed pages, <code>-1</code> means no limit.
- */
- private int maxPages = -1;
-
- /**
- * Subsequent call politeness delay, <code>-1</code> means no limit.
- */
- private int politenessDelay = -1;
+ private final CrawlConfig crawlConfig;
/**
* Internal executor service.
@@ -111,7 +105,15 @@ public class SiteCrawler {
*/
public SiteCrawler(File storageFolder) {
try {
- controller = new CrawlController( storageFolder.getAbsolutePath() );
+ crawlConfig = new CrawlConfig();
+ crawlConfig.setCrawlStorageFolder( storageFolder.getAbsolutePath() );
+
+ final PageFetcher pageFetcher = new PageFetcher(crawlConfig);
+
+ RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
+ final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
+
+ controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
} catch (Exception e) {
throw new IllegalArgumentException("Error while initializing crawler controller.", e);
}
@@ -139,7 +141,7 @@ public class SiteCrawler {
}
/**
- * Sets the actual crawler clas.
+ * Sets the actual crawler class.
*
* @param c a not <code>class</code>.
*/
@@ -152,7 +154,7 @@ public class SiteCrawler {
* @return the max allowed crawl depth, <code>-1</code> means no limit.
*/
public int getMaxDepth() {
- return maxDepth;
+ return crawlConfig.getMaxDepthOfCrawling();
}
/**
@@ -162,19 +164,14 @@ public class SiteCrawler {
*/
public void setMaxDepth(int maxDepth) {
if(maxDepth < -1 || maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0");
- if(maxDepth > 0) try {
- controller.setMaximumCrawlDepth(maxDepth);
- } catch (Exception e) {
- throw new IllegalArgumentException("Error while setting maxDepth.", e);
- }
- this.maxDepth = maxDepth;
+ crawlConfig.setMaxDepthOfCrawling(maxDepth);
}
/**
* @return max number of allowed pages.
*/
public int getMaxPages() {
- return maxPages;
+ return crawlConfig.getMaxPagesToFetch();
}
/**
@@ -184,26 +181,23 @@ public class SiteCrawler {
*/
public void setMaxPages(int maxPages) {
if(maxPages < -1 || maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
- if(maxPages > 0) controller.setMaximumPagesToFetch(maxPages);
- this.maxPages = maxPages;
+ crawlConfig.setMaxPagesToFetch(maxPages);
}
/**
* @return the politeness delay in milliseconds.
*/
public int getPolitenessDelay() {
- return politenessDelay;
+ return crawlConfig.getPolitenessDelay();
}
/**
- * Sets the politeness delay. <code>-1</code> means no politeness.
+ * Sets the politeness delay.
*
* @param millis delay in milliseconds.
*/
public void setPolitenessDelay(int millis) {
- if(millis < -1) throw new IllegalArgumentException("Invalid politenessDelay, must be >= -1");
- if(millis >= 0) controller.setPolitenessDelay(millis);
- this.politenessDelay = millis;
+ if(millis >= 0) crawlConfig.setPolitenessDelay(millis);
}
/**
Modified: incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java?rev=1296216&r1=1296215&r2=1296216&view=diff
==============================================================================
--- incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java (original)
+++ incubator/any23/trunk/plugins/basic-crawler/src/test/java/org/apache/any23/cli/CrawlerTest.java Fri Mar 2 14:13:35 2012
@@ -33,6 +33,7 @@ import java.io.IOException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
/**
* Test case for {@link Crawler} CLI.
@@ -70,6 +71,9 @@ public class CrawlerTest extends Any23On
future.get(10, TimeUnit.SECONDS);
} catch (Exception e) {
// OK.
+ if( ! (e instanceof TimeoutException) ) {
+ e.printStackTrace();
+ }
}
Assert.assertTrue("The output file has not been created.", outFile.exists());