Posted to commits@nutch.apache.org by sn...@apache.org on 2022/01/18 07:22:42 UTC
[nutch] branch master updated: NUTCH-2573 Suspend crawling if robots.txt fails to fetch with 5xx status (#724)
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new f691bae NUTCH-2573 Suspend crawling if robots.txt fails to fetch with 5xx status (#724)
f691bae is described below
commit f691baebc3c04c08ea500f4767e2decb88c30c70
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Jan 18 08:22:36 2022 +0100
NUTCH-2573 Suspend crawling if robots.txt fails to fetch with 5xx status (#724)
NUTCH-2573 Suspend crawling if robots.txt fails to fetch with 5xx status
- add properties
http.robots.503.defer.visits :
enable/disable the feature (default: enabled)
http.robots.503.defer.visits.delay :
delay to wait before the next trial to fetch the deferred URL
and the corresponding robots.txt
(default: wait 5 minutes)
http.robots.503.defer.visits.retries :
max. number of retries before giving up and dropping all URLs from
the given host / queue
(default: give up after the 3rd retry, i.e. after 4 attempts)
- handle HTTP 5xx in robots.txt parser
- handle delay, retries and dropping queues in Fetcher
- count dropped fetch items in `robots_defer_visits_dropped`
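
A minimal sketch of how the new properties are read from the Nutch/Hadoop configuration, mirroring the FetcherThread change in the diff below (the class and variable names here are illustrative, not part of the commit):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.util.NutchConfiguration;

    public class RobotsDeferConfigSketch {
      public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        // feature switch, enabled by default
        boolean deferVisits = conf.getBoolean("http.robots.503.defer.visits", true);
        // delay before the next attempt, default 5 minutes (300000 ms)
        long deferDelayMs = conf.getLong("http.robots.503.defer.visits.delay", 5 * 60 * 1000L);
        // retries before the host queue is dropped, default 3 (i.e. 4 attempts)
        int deferRetries = conf.getInt("http.robots.503.defer.visits.retries", 3);
        System.out.println(deferVisits + ", " + deferDelayMs + " ms, " + deferRetries + " retries");
      }
    }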
---
conf/nutch-default.xml | 26 +++++++++
.../org/apache/nutch/fetcher/FetchItemQueues.java | 64 +++++++++++++++++-----
.../org/apache/nutch/fetcher/FetcherThread.java | 39 ++++++++++++-
.../apache/nutch/protocol/RobotRulesParser.java | 13 +++++
.../protocol/http/api/HttpRobotRulesParser.java | 15 ++++-
5 files changed, 140 insertions(+), 17 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c305fa8..29a4716 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -130,6 +130,32 @@
</property>
<property>
+ <name>http.robots.503.defer.visits</name>
+ <value>true</value>
+ <description>Temporarily suspend fetching from a host if the
+ robots.txt response is HTTP 503 or any other 5xx server error. See
+ also http.robots.503.defer.visits.delay and
+ http.robots.503.defer.visits.retries</description>
+</property>
+
+<property>
+ <name>http.robots.503.defer.visits.delay</name>
+ <value>300000</value>
+ <description>Time in milliseconds to suspend crawling a host if the
+ robots.txt response is HTTP 5xx - see
+ http.robots.503.defer.visits.</description>
+</property>
+
+<property>
+ <name>http.robots.503.defer.visits.retries</name>
+ <value>3</value>
+ <description>Number of retries crawling a host if the robots.txt
+ response is HTTP 5xx - see http.robots.503.defer.visits. After n
+ retries the host queue is dropped for this segment/cycle.
+ </description>
+</property>
+
+<property>
<name>http.agent.description</name>
<value></value>
<description>Further description of our bot- this text is used in
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index 00a0784..ceb8cab 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -195,11 +195,19 @@ public class FetchItemQueues {
return null;
}
+ /**
+ * @return true if the fetcher timelimit is defined and has been exceeded
+ * ({@code fetcher.timelimit.mins} minutes after fetching started)
+ */
+ public boolean timelimitExceeded() {
+ return timelimit != -1 && System.currentTimeMillis() >= timelimit;
+ }
+
// called only once the feeder has stopped
public synchronized int checkTimelimit() {
int count = 0;
- if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
+ if (timelimitExceeded()) {
// emptying the queues
count = emptyQueues();
@@ -209,6 +217,7 @@ public class FetchItemQueues {
if (totalSize.get() != 0 && queues.size() == 0)
totalSize.set(0);
}
+
return count;
}
@@ -220,11 +229,9 @@ public class FetchItemQueues {
FetchItemQueue fiq = queues.get(id);
if (fiq.getQueueSize() == 0)
continue;
- LOG.info("* queue: " + id + " >> dropping! ");
+ LOG.info("* queue: {} >> dropping!", id);
int deleted = fiq.emptyQueue();
- for (int i = 0; i < deleted; i++) {
- totalSize.decrementAndGet();
- }
+ totalSize.addAndGet(-deleted);
count += deleted;
}
@@ -235,26 +242,43 @@ public class FetchItemQueues {
* Increment the exception counter of a queue in case of an exception e.g.
* timeout; when higher than a given threshold simply empty the queue.
*
- * @param queueid a queue identifier to locate and check
+ * The next fetch is delayed if specified by the param {@code delay} or
+ * configured by the property {@code fetcher.exceptions.per.queue.delay}.
+ *
+ * @param queueid
+ * a queue identifier to locate and check
+ * @param maxExceptions
+ * custom-defined number of max. exceptions - if negative the value
+ * of the property {@code fetcher.max.exceptions.per.queue} is used.
+ * @param delay
+ * a custom-defined time span in milliseconds to delay the next fetch
+ * in addition to the delay defined for the given queue. If a
+ * negative value is passed the delay is chosen by
+ * {@code fetcher.exceptions.per.queue.delay}
+ *
* @return number of purged items
*/
- public synchronized int checkExceptionThreshold(String queueid) {
+ public synchronized int checkExceptionThreshold(String queueid,
+ int maxExceptions, long delay) {
FetchItemQueue fiq = queues.get(queueid);
if (fiq == null) {
return 0;
}
int excCount = fiq.incrementExceptionCounter();
+ if (delay > 0) {
+ fiq.nextFetchTime.getAndAdd(delay);
+ LOG.info("* queue: {} >> delayed next fetch by {} ms", queueid, delay);
+ }
if (fiq.getQueueSize() == 0) {
return 0;
}
- if (maxExceptionsPerQueue != -1 && excCount >= maxExceptionsPerQueue) {
+ if (maxExceptions != -1 && excCount >= maxExceptions) {
// too many exceptions for items in this queue - purge it
int deleted = fiq.emptyQueue();
- LOG.info("* queue: " + queueid + " >> removed " + deleted
- + " URLs from queue because " + excCount + " exceptions occurred");
- for (int i = 0; i < deleted; i++) {
- totalSize.decrementAndGet();
- }
+ LOG.info(
+ "* queue: {} >> removed {} URLs from queue because {} exceptions occurred",
+ queueid, deleted, excCount);
+ totalSize.getAndAdd(-deleted);
// keep queue IDs to ensure that these queues aren't created and filled
// again, see addFetchItem(FetchItem)
queuesMaxExceptions.add(queueid);
@@ -264,6 +288,20 @@ public class FetchItemQueues {
}
/**
+ * Increment the exception counter of a queue in case of an exception e.g.
+ * timeout; when higher than a given threshold simply empty the queue.
+ *
+ * @see #checkExceptionThreshold(String, int, long)
+ *
+ * @param queueid
+ * queue identifier to locate and check
+ * @return number of purged items
+ */
+ public int checkExceptionThreshold(String queueid) {
+ return checkExceptionThreshold(queueid, this.maxExceptionsPerQueue, -1);
+ }
+
+ /**
* @param redirUrl
* redirect target
* @return true if redirects are deduplicated and redirUrl has been queued
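
A hedged usage sketch of the new three-argument checkExceptionThreshold overload; "fetchQueues" stands for an existing FetchItemQueues instance and the queue id is illustrative. The fetcher passes retries + 1 because the exception counter is incremented once per deferred robots.txt response, so with the default of 3 retries the queue is purged on the 4th failed attempt:

    int retries = 3;                  // http.robots.503.defer.visits.retries
    long delayMs = 5 * 60 * 1000L;    // http.robots.503.defer.visits.delay
    // pushes the queue's nextFetchTime forward by delayMs and, once the
    // exception counter reaches retries + 1, empties the queue and returns
    // the number of dropped fetch items
    int dropped = fetchQueues.checkExceptionThreshold("example.com", retries + 1, delayMs);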
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 40b7201..d5fe343 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -139,6 +139,8 @@ public class FetcherThread extends Thread {
private AtomicLong bytes;
private List<Content> robotsTxtContent = null;
+ private long robotsDeferVisitsDelay;
+ private int robotsDeferVisitsRetries;
//Used by the REST service
private FetchNode fetchNode;
@@ -194,6 +196,14 @@ public class FetcherThread extends Thread {
URLNormalizers.SCOPE_OUTLINK);
}
+ // NUTCH-2573 defer visits if robots.txt fails with HTTP 5xx
+ if (conf.getBoolean("http.robots.503.defer.visits", true)) {
+ this.robotsDeferVisitsDelay = conf
+ .getLong("http.robots.503.defer.visits.delay", 5 * 60 * 1000L);
+ this.robotsDeferVisitsRetries = conf
+ .getInt("http.robots.503.defer.visits.retries", 3);
+ }
+
if((activatePublisher=conf.getBoolean("fetcher.publisher", false)))
this.publisher = new FetcherThreadPublisher(conf);
@@ -312,6 +322,25 @@ public class FetcherThread extends Thread {
outputRobotsTxt(robotsTxtContent);
robotsTxtContent.clear();
}
+ if (rules.isDeferVisits()) {
+ LOG.info("Defer visits for queue {} : {}", fit.queueID, fit.url);
+ // retry the fetch item
+ if (fetchQueues.timelimitExceeded()) {
+ fetchQueues.finishFetchItem(fit, true);
+ } else {
+ fetchQueues.addFetchItem(fit);
+ }
+ // but check whether it's time to cancel the queue
+ int killedURLs = fetchQueues.checkExceptionThreshold(
+ fit.getQueueID(), this.robotsDeferVisitsRetries + 1,
+ this.robotsDeferVisitsDelay);
+ if (killedURLs != 0) {
+ context
+ .getCounter("FetcherStatus", "robots_defer_visits_dropped")
+ .increment(killedURLs);
+ }
+ continue;
+ }
if (!rules.isAllowed(fit.url.toString())) {
// unblock
fetchQueues.finishFetchItem(fit, true);
@@ -600,6 +629,12 @@ public class FetcherThread extends Thread {
LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url,
redirUrl);
return null;
+ } else if (fetchQueues.timelimitExceeded()) {
+ redirecting = false;
+ context.getCounter("FetcherStatus", "hitByTimeLimit").increment(1);
+ LOG.debug(" - ignoring redirect from {} to {} - timelimit reached",
+ fit.url, redirUrl);
+ return null;
}
CrawlDatum newDatum = createRedirDatum(redirUrl, fit, CrawlDatum.STATUS_DB_UNFETCHED);
fit = FetchItem.create(redirUrl, newDatum, queueMode);
@@ -780,8 +815,10 @@ public class FetcherThread extends Thread {
reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
publisher.publish(reportEvent, conf);
}
+
// Only process depth N outlinks
- if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
+ if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth
+ && !fetchQueues.timelimitExceeded()) {
FetchItem ft = FetchItem.create(url, null, queueMode);
FetchItemQueue queue = fetchQueues.getFetchItemQueue(ft.queueID);
queue.alreadyFetched.add(url.toString().hashCode());
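
Dropped items are reported through the new "robots_defer_visits_dropped" counter in the "FetcherStatus" group. A sketch of reading it after the fetch job has finished, assuming "job" is the completed org.apache.hadoop.mapreduce.Job handle of the fetch step:

    long droppedByRobotsDefer = job.getCounters()
        .findCounter("FetcherStatus", "robots_defer_visits_dropped")
        .getValue();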
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 97256d2..1493bc2 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -77,6 +77,19 @@ public abstract class RobotRulesParser implements Tool {
public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(
RobotRulesMode.ALLOW_NONE);
+ /**
+ * A {@link BaseRobotRules} object appropriate for use when the
+ * {@code robots.txt} file failed to fetch with a 503 "Service
+ * Unavailable" (or other 5xx) status code. The crawler should suspend crawling
+ * for a certain (but not too long) time, see property
+ * <code>http.robots.503.defer.visits</code>.
+ */
+ public static final BaseRobotRules DEFER_VISIT_RULES = new SimpleRobotRules(
+ RobotRulesMode.ALLOW_NONE);
+ static {
+ DEFER_VISIT_RULES.setDeferVisits(true);
+ }
+
private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
static {
robotParser.setMaxCrawlDelay(Long.MAX_VALUE);
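
DEFER_VISIT_RULES is simply an ALLOW_NONE rule set with crawler-commons' defer-visits flag switched on, which lets the fetcher branch on isDeferVisits() before checking isAllowed(), as in the FetcherThread hunk above. A minimal sketch of the same construction (assuming the crawler-commons robots classes are on the classpath; "deferRules" is an illustrative name):

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRules;
    import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

    BaseRobotRules deferRules = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
    deferRules.setDeferVisits(true);
    // deferRules.isDeferVisits() -> true; deferRules.isAllowed(url) -> false for any url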
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 4f3afd3..ad2521b 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -41,7 +41,9 @@ public class HttpRobotRulesParser extends RobotRulesParser {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
+
protected boolean allowForbidden = false;
+ protected boolean deferVisits503 = false;
HttpRobotRulesParser() {
}
@@ -53,6 +55,7 @@ public class HttpRobotRulesParser extends RobotRulesParser {
public void setConf(Configuration conf) {
super.setConf(conf);
allowForbidden = conf.getBoolean("http.robots.403.allow", true);
+ deferVisits503 = conf.getBoolean("http.robots.503.defer.visits", true);
}
/**
@@ -110,7 +113,7 @@ public class HttpRobotRulesParser extends RobotRulesParser {
if (robotRules != null) {
return robotRules; // cached rule
} else if (LOG.isTraceEnabled()) {
- LOG.trace("cache miss " + url);
+ LOG.trace("cache miss {}", url);
}
boolean cacheRule = true;
@@ -163,9 +166,15 @@ public class HttpRobotRulesParser extends RobotRulesParser {
robotRules = FORBID_ALL_RULES; // use forbid all
else if (response.getCode() >= 500) {
cacheRule = false; // try again later to fetch robots.txt
- robotRules = EMPTY_RULES;
- } else
+ if (deferVisits503) {
+ // signal fetcher to suspend crawling for this host
+ robotRules = DEFER_VISIT_RULES;
+ } else {
+ robotRules = EMPTY_RULES;
+ }
+ } else {
robotRules = EMPTY_RULES; // use default rules
+ }
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());