Posted to commits@nutch.apache.org by sn...@apache.org on 2020/04/10 11:41:56 UTC

[nutch] branch master updated: NUTCH-2775 Fetcher to guarantee minimum delay even if robots.txt defines shorter Crawl-delay - guaranteed minimum delay is configured by `fetcher.min.crawl.delay` (default set equal to `fetcher.server.delay`)

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new e6bc451  NUTCH-2775 Fetcher to guarantee minimum delay even if robots.txt defines shorter Crawl-delay - guaranteed minimum delay is configured by `fetcher.min.crawl.delay`   (default set equal to `fetcher.server.delay`)
     new e6d3e57  Merge pull request #506 from sebastian-nagel/NUTCH-2775-robots-min-delay
e6bc451 is described below

commit e6bc45181369ced98fa7d9df23685620938b0c9c
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Wed Mar 25 10:32:36 2020 +0100

    NUTCH-2775 Fetcher to guarantee minimum delay even if robots.txt defines shorter Crawl-delay
    - guaranteed minimum delay is configured by `fetcher.min.crawl.delay`
      (default set equal to `fetcher.server.delay`)
---
 conf/nutch-default.xml                               | 12 ++++++++++++
 src/java/org/apache/nutch/fetcher/FetcherThread.java | 17 ++++++++++++++---
 2 files changed, 26 insertions(+), 3 deletions(-)
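For context before the full diff, below is a minimal standalone sketch of the clamping behaviour this commit introduces: the Crawl-Delay taken from robots.txt is raised to the configured minimum (`fetcher.min.crawl.delay`, defaulting to `fetcher.server.delay`) and hosts whose Crawl-Delay exceeds `fetcher.max.crawl.delay` are still skipped. The class and method names here (EffectiveDelay, clamp) are illustrative only and are not part of the Nutch API; the real logic lives in FetcherThread as shown in the diff. Note that crawler-commons reports Crawl-Delay in milliseconds, while the Nutch properties are given in seconds.

// Illustrative sketch only; names are hypothetical, not Nutch code.
public class EffectiveDelay {

  /**
   * @param robotsCrawlDelayMs Crawl-Delay from robots.txt, in milliseconds
   * @param minCrawlDelayMs    fetcher.min.crawl.delay converted to milliseconds
   * @param maxCrawlDelayMs    fetcher.max.crawl.delay converted to milliseconds
   * @return delay to apply to the fetch queue, or -1 if the host should be skipped
   */
  static long clamp(long robotsCrawlDelayMs, long minCrawlDelayMs, long maxCrawlDelayMs) {
    if (maxCrawlDelayMs >= 0 && robotsCrawlDelayMs > maxCrawlDelayMs) {
      return -1; // too long: the fetcher skips the host (robots denied)
    }
    // too short: raise to the configured minimum so a robots.txt value
    // cannot make the crawler more aggressive than fetcher.server.delay
    return Math.max(robotsCrawlDelayMs, minCrawlDelayMs);
  }

  public static void main(String[] args) {
    // with fetcher.server.delay = 1.0 s the minimum defaults to 1000 ms,
    // and fetcher.max.crawl.delay = 30 s corresponds to 30000 ms
    System.out.println(clamp(200, 1000, 30000));   // 1000 (raised to minimum)
    System.out.println(clamp(5000, 1000, 30000));  // 5000 (robots.txt value kept)
    System.out.println(clamp(60000, 1000, 30000)); // -1   (host skipped)
  }
}

To opt out of the clamping and honour shorter robots.txt delays again, `fetcher.min.crawl.delay` can be set to 0 in nutch-site.xml, overriding the default shown in the nutch-default.xml hunk below.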

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 85d9933..6dfbe64 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -959,6 +959,18 @@
 </property> 
 
 <property>
+ <name>fetcher.min.crawl.delay</name>
+ <value>${fetcher.server.delay}</value>
+ <description>
+ Minimum Crawl-Delay (in seconds) accepted in robots.txt, even if the
+ robots.txt specifies a shorter delay. By default the minimum Crawl-Delay
+ is set to the value of `fetcher.server.delay` which guarantees that
+ a value set in the robots.txt cannot make the crawler more aggressive
+ than the default configuration.
+ </description>
+</property>
+
+<property>
   <name>fetcher.threads.fetch</name>
   <value>10</value>
   <description>The number of FetcherThreads the fetcher should use.
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 5d5a20b..549cd36 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -85,6 +85,7 @@ public class FetcherThread extends Thread {
   private URLNormalizers normalizers;
   private ProtocolFactory protocolFactory;
   private long maxCrawlDelay;
+  private long minCrawlDelay;
   private String queueMode;
   private int maxRedirect;
   private boolean maxRedirectExceededSkip = false;
@@ -165,6 +166,9 @@ public class FetcherThread extends Thread {
     this.protocolFactory = new ProtocolFactory(conf);
     this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
     this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
+    float crawlDelay = conf.getFloat("fetcher.server.delay", 1.0f);
+    this.minCrawlDelay = (long) (conf.getFloat("fetcher.min.crawl.delay",
+        crawlDelay) * 1000);
     this.activeThreads = activeThreads;
     this.fetchQueues = fetchQueues;
     this.feeder = feeder;
@@ -324,8 +328,8 @@ public class FetcherThread extends Thread {
               if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                 // unblock
                 fetchQueues.finishFetchItem(fit, true);
-                LOG.info("Crawl-Delay for {} too long ({}), skipping", fit.url,
-                    rules.getCrawlDelay());
+                LOG.info("Crawl-Delay for {} too long ({} ms), skipping",
+                    fit.url, rules.getCrawlDelay());
                 output(fit.url, fit.datum, null,
                     ProtocolStatus.STATUS_ROBOTS_DENIED,
                     CrawlDatum.STATUS_FETCH_GONE);
@@ -334,7 +338,14 @@ public class FetcherThread extends Thread {
                 continue;
               } else {
                 FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
-                fiq.crawlDelay = rules.getCrawlDelay();
+                long crawlDelay = rules.getCrawlDelay();
+                if (crawlDelay < minCrawlDelay) {
+                  LOG.info(
+                      "Crawl-Delay for {} too short ({} ms), adjusting to {} ms",
+                      fit.url, rules.getCrawlDelay(), minCrawlDelay);
+                  crawlDelay = minCrawlDelay;
+                }
+                fiq.crawlDelay = crawlDelay;
                 if (LOG.isDebugEnabled()) {
                   LOG.debug("Crawl delay for queue: " + fit.queueID
                       + " is set to " + fiq.crawlDelay