You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/04/10 11:41:56 UTC
[nutch] branch master updated: NUTCH-2775 Fetcher to guarantee
minimum delay even if robots.txt defines shorter Crawl-delay - guaranteed
minimum delay is configured by `fetcher.min.crawl.delay` (default set equal
to `fetcher.server.delay`)
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new e6bc451 NUTCH-2775 Fetcher to guarantee minimum delay even if robots.txt defines shorter Crawl-delay - guaranteed minimum delay is configured by `fetcher.min.crawl.delay` (default set equal to `fetcher.server.delay`)
new e6d3e57 Merge pull request #506 from sebastian-nagel/NUTCH-2775-robots-min-delay
e6bc451 is described below
commit e6bc45181369ced98fa7d9df23685620938b0c9c
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Wed Mar 25 10:32:36 2020 +0100
NUTCH-2775 Fetcher to guarantee minimum delay even if robots.txt defines shorter Crawl-delay
- guaranteed minimum delay is configured by `fetcher.min.crawl.delay`
(default set equal to `fetcher.server.delay`)
---
conf/nutch-default.xml | 12 ++++++++++++
src/java/org/apache/nutch/fetcher/FetcherThread.java | 17 ++++++++++++++---
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 85d9933..6dfbe64 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -959,6 +959,18 @@
</property>
<property>
+ <name>fetcher.min.crawl.delay</name>
+ <value>${fetcher.server.delay}</value>
+ <description>
+ Minimum Crawl-Delay (in seconds) honored by the fetcher: if the
+ robots.txt specifies a shorter Crawl-Delay, this minimum is used instead.
+ By default the minimum is set to the value of `fetcher.server.delay`,
+ which guarantees that a value set in the robots.txt cannot make the
+ crawler more aggressive than the default configuration.
+ </description>
+</property>
+
+<property>
<name>fetcher.threads.fetch</name>
<value>10</value>
<description>The number of FetcherThreads the fetcher should use.
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 5d5a20b..549cd36 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -85,6 +85,7 @@ public class FetcherThread extends Thread {
private URLNormalizers normalizers;
private ProtocolFactory protocolFactory;
private long maxCrawlDelay;
+ private long minCrawlDelay;
private String queueMode;
private int maxRedirect;
private boolean maxRedirectExceededSkip = false;
@@ -165,6 +166,9 @@ public class FetcherThread extends Thread {
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
+ float crawlDelay = conf.getFloat("fetcher.server.delay", 1.0f);
+ this.minCrawlDelay = (long) (conf.getFloat("fetcher.min.crawl.delay",
+ crawlDelay) * 1000);
this.activeThreads = activeThreads;
this.fetchQueues = fetchQueues;
this.feeder = feeder;
@@ -324,8 +328,8 @@ public class FetcherThread extends Thread {
if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
// unblock
fetchQueues.finishFetchItem(fit, true);
- LOG.info("Crawl-Delay for {} too long ({}), skipping", fit.url,
- rules.getCrawlDelay());
+ LOG.info("Crawl-Delay for {} too long ({} ms), skipping",
+ fit.url, rules.getCrawlDelay());
output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatum.STATUS_FETCH_GONE);
@@ -334,7 +338,14 @@ public class FetcherThread extends Thread {
continue;
} else {
FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
- fiq.crawlDelay = rules.getCrawlDelay();
+ long crawlDelay = rules.getCrawlDelay();
+ if (crawlDelay < minCrawlDelay) {
+ LOG.info(
+ "Crawl-Delay for {} too short ({} ms), adjusting to {} ms",
+ fit.url, rules.getCrawlDelay(), minCrawlDelay);
+ crawlDelay = minCrawlDelay;
+ }
+ fiq.crawlDelay = crawlDelay;
if (LOG.isDebugEnabled()) {
LOG.debug("Crawl delay for queue: " + fit.queueID
+ " is set to " + fiq.crawlDelay