You are viewing a plain-text version of this content. The canonical link for it is available on the original archive page (the hyperlink was not preserved in this text extraction).
Posted to commits@nutch.apache.org by sn...@apache.org on 2018/11/15 10:34:01 UTC

[nutch] 02/14: NUTCH-2630 Fetcher to log skipped records by robots.txt - change required log level to INFO (default) for messages reporting skipped URLs because of robots.txt rules (disallow or crawl delay larger than fetcher.max.crawl.delay)

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 524a59480a3e258a0363faf343fa57875f8f9ea8
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Mon Oct 8 14:50:51 2018 +0200

    NUTCH-2630 Fetcher to log skipped records by robots.txt
    - change required log level to INFO (default) for messages
      reporting skipped URLs because of robots.txt rules
      (disallow or crawl delay larger than fetcher.max.crawl.delay)
---
 src/java/org/apache/nutch/fetcher/FetcherThread.java | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index bfcc374..6ba920e 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -302,9 +302,7 @@ public class FetcherThread extends Thread {
             if (!rules.isAllowed(fit.url.toString())) {
               // unblock
               ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
-              if (LOG.isDebugEnabled()) {
-                LOG.debug("Denied by robots.txt: {}", fit.url);
-              }
+              LOG.info("Denied by robots.txt: {}", fit.url);
               output(fit.url, fit.datum, null,
                   ProtocolStatus.STATUS_ROBOTS_DENIED,
                   CrawlDatum.STATUS_FETCH_GONE);
@@ -315,7 +313,7 @@ public class FetcherThread extends Thread {
               if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                 // unblock
                 ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
-                LOG.debug("Crawl-Delay for {} too long ({}), skipping", fit.url,
+                LOG.info("Crawl-Delay for {} too long ({}), skipping", fit.url,
                     rules.getCrawlDelay());
                 output(fit.url, fit.datum, null,
                     ProtocolStatus.STATUS_ROBOTS_DENIED,