You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by te...@apache.org on 2013/01/28 08:59:41 UTC

svn commit: r1439291 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/fetcher/FetcherReducer.java

Author: tejasp
Date: Mon Jan 28 07:59:41 2013
New Revision: 1439291

URL: http://svn.apache.org/viewvc?rev=1439291&view=rev
Log:
NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1439291&r1=1439290&r2=1439291&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Jan 28 07:59:41 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
+* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas Patil)
+
 * NUTCH-1453 Substantiate tests for IndexingFilters (lufeng via lewismc)
 
 * NUTCH-1274 Fix [cast] javac warnings (Tejas Patil via lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1439291&r1=1439290&r2=1439291&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Mon Jan 28 07:59:41 2013
@@ -484,7 +484,8 @@ extends GoraReducer<IntWritable, FetchEn
             reprUrl = TableUtil.toString(fit.page.getReprUrl());
           }
           try {
-            LOG.info("fetching " + fit.url);
+            LOG.info("fetching " + fit.url + " (queue crawl delay=" + 
+                      fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + "ms)"); 
 
             // fetch the page
             final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
@@ -500,7 +501,7 @@ extends GoraReducer<IntWritable, FetchEn
               continue;
             }
             if (rules.getCrawlDelay() > 0) {
-              if (rules.getCrawlDelay() > maxCrawlDelay) {
+              if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                 // unblock
                 fetchQueues.finishFetchItem(fit, true);
                 LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
@@ -509,6 +510,9 @@ extends GoraReducer<IntWritable, FetchEn
               } else {
                 final FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
                 fiq.crawlDelay = rules.getCrawlDelay();
+                if (LOG.isDebugEnabled()) {
+                  LOG.info("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + fit.url);
+                }
               }
             }
             final ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.page);
@@ -875,3 +879,4 @@ extends GoraReducer<IntWritable, FetchEn
     LOG.info("-activeThreads=" + activeThreads);
   }
 }
+