You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2009/11/25 18:20:34 UTC
svn commit: r884203 - in /lucene/nutch/trunk: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Author: ab
Date: Wed Nov 25 17:20:33 2009
New Revision: 884203
URL: http://svn.apache.org/viewvc?rev=884203&view=rev
Log:
NUTCH-753 Prevent new Fetcher from retrieving the robots twice.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884203&r1=884202&r2=884203&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 17:20:33 2009
@@ -2,7 +2,9 @@
Unreleased Changes
-* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab)
+* NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab)
+
+* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab via ab)
* NUTCH-765 - Allow Crawl class to call Either Solr or Lucene Indexer (kubes)
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=884203&r1=884202&r2=884203&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed Nov 25 17:20:33 2009
@@ -185,6 +185,7 @@
String urlString = url.toString();
try {
URL u = new URL(urlString);
+ long delay = serverDelay;
if (checkRobots) {
try {
@@ -197,10 +198,10 @@
logger.trace("Exception checking robot rules for " + url + ": " + e);
}
}
+
+ long crawlDelay = robots.getCrawlDelay(this, u);
+ delay = crawlDelay > 0 ? crawlDelay : serverDelay;
}
-
- long crawlDelay = robots.getCrawlDelay(this, u);
- long delay = crawlDelay > 0 ? crawlDelay : serverDelay;
if (checkBlocking && maxCrawlDelay >= 0 && delay > maxCrawlDelay) {
// skip this page, otherwise the thread would block for too long.
LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max="