You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by te...@apache.org on 2013/01/28 08:54:30 UTC

svn commit: r1439289 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java

Author: tejasp
Date: Mon Jan 28 07:54:30 2013
New Revision: 1439289

URL: http://svn.apache.org/viewvc?rev=1439289&view=rev
Log:
NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1439289&r1=1439288&r2=1439289&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jan 28 07:54:30 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas Patil)
+
 * NUTCH-1453 Substantiate tests for IndexingFilters (lufeng via lewismc)
 
 * NUTCH-840  Port tests from parse-html to parse-tika (lewismc, jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1439289&r1=1439288&r2=1439289&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Jan 28 07:54:30 2013
@@ -662,7 +662,10 @@ public class Fetcher extends Configured 
             redirecting = false;
             redirectCount = 0;
             do {
-              if (LOG.isInfoEnabled()) { LOG.info("fetching " + fit.url); }
+              if (LOG.isInfoEnabled()) {
+                LOG.info("fetching " + fit.url + " (queue crawl delay=" + 
+                         fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + "ms)"); 
+              }
               if (LOG.isDebugEnabled()) {
                 LOG.debug("redirectCount=" + redirectCount);
               }
@@ -680,7 +683,7 @@ public class Fetcher extends Configured 
                 continue;
               }
               if (rules.getCrawlDelay() > 0) {
-                if (rules.getCrawlDelay() > maxCrawlDelay) {
+                if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                   // unblock
                   fetchQueues.finishFetchItem(fit, true);
                   LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
@@ -690,6 +693,9 @@ public class Fetcher extends Configured 
                 } else {
                   FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                   fiq.crawlDelay = rules.getCrawlDelay();
+                  if (LOG.isDebugEnabled()) {
+                    LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
+                  }
                 }
               }
               ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);