You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2022/05/19 13:25:53 UTC

[nutch] 01/02: NUTCH-2946 Fetcher: slow down fetching from hosts where requests fail repeatedly with exceptions or HTTP status codes mapped to ProtocolStatus.EXCEPTION (HTTP 403 Forbidden, 429 Too many requests, 5xx server errors, etc.)

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 42ae2a34505e23319861e7b31fd9f87f1af68749
Author: Sebastian Nagel <se...@commoncrawl.org>
AuthorDate: Fri Jan 14 18:31:31 2022 +0100

    NUTCH-2946 Fetcher: slow down fetching from hosts where requests fail repeatedly
    with exceptions or HTTP status codes mapped to ProtocolStatus.EXCEPTION
    (HTTP 403 Forbidden, 429 Too many requests, 5xx server errors, etc.)
---
 conf/nutch-default.xml                                | 19 +++++++++++++++----
 .../org/apache/nutch/fetcher/FetchItemQueues.java     | 12 ++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 29a4716b5..7775fc70d 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1081,10 +1081,21 @@
 <property>
   <name>fetcher.max.exceptions.per.queue</name>
   <value>-1</value>
-  <description>The maximum number of protocol-level exceptions (e.g. timeouts) per
-  host (or IP) queue. Once this value is reached, any remaining entries from this
-  queue are purged, effectively stopping the fetching from this host/IP. The default
-  value of -1 deactivates this limit.
+  <description>The maximum number of protocol-level exceptions
+  (e.g. timeouts) or HTTP status codes mapped to ProtocolStatus.EXCEPTION
+  per host (or IP) queue. Once this value is reached, any remaining entries
+  from this queue are purged, effectively stopping the fetching from this
+  host/IP. The default value of -1 deactivates this limit.
+  </description>
+</property>
+
+<property>
+  <name>fetcher.exceptions.per.queue.delay</name>
+  <value>-1</value>
+  <description>Additional delay in milliseconds slowing down fetches from a queue if an
+  exception has occurred (see also fetcher.max.exceptions.per.queue). A value of 0 or less
+  (default: -1) adds no delay. The delay grows logarithmically with the number of observed exceptions:
+     delay = fetcher.exceptions.per.queue.delay * log2(1 + num_exceptions_in_queue)
   </description>
 </property>
 
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index ceb8cab59..0faf391ce 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -56,6 +56,7 @@ public class FetchItemQueues {
   long minCrawlDelay;
   long timelimit = -1;
   int maxExceptionsPerQueue = -1;
+  long exceptionsPerQueueDelay = -1;
   Configuration conf;
 
   public static final String QUEUE_MODE_HOST = "byHost";
@@ -84,6 +85,8 @@ public class FetchItemQueues {
     this.timelimit = conf.getLong("fetcher.timelimit", -1);
     this.maxExceptionsPerQueue = conf.getInt(
         "fetcher.max.exceptions.per.queue", -1);
+    this.exceptionsPerQueueDelay = conf
+        .getLong("fetcher.exceptions.per.queue.delay", -1);
 
     int dedupRedirMaxTime = conf.getInt("fetcher.redirect.dedupcache.seconds",
         -1);
@@ -268,6 +271,15 @@ public class FetchItemQueues {
     if (delay > 0) {
       fiq.nextFetchTime.getAndAdd(delay);
       LOG.info("* queue: {} >> delayed next fetch by {} ms", queueid, delay);
+    } else if (exceptionsPerQueueDelay > 0) {
+      // delay the next fetch by a time span growing at log scale
+      // with the number of observed exceptions
+      long exceptionDelay = (long) (exceptionsPerQueueDelay
+          * Math.log(1 + excCount) / Math.log(2));
+      fiq.nextFetchTime.getAndAdd(exceptionDelay);
+      LOG.info(
+          "* queue: {} >> delayed next fetch by {} ms after {} exceptions in queue",
+          queueid, exceptionDelay, excCount);
     }
     if (fiq.getQueueSize() == 0) {
       return 0;