Posted to commits@nutch.apache.org by si...@apache.org on 2006/09/19 16:52:37 UTC
svn commit: r447867 - in /lucene/nutch/branches/branch-0.8: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Author: siren
Date: Tue Sep 19 07:52:37 2006
New Revision: 447867
URL: http://svn.apache.org/viewvc?view=rev&rev=447867
Log:
NUTCH-105 - Network error during robots.txt fetch causes file to be ignored, contributed by Greg Kim
Modified:
lucene/nutch/branches/branch-0.8/CHANGES.txt
lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diff&rev=447867&r1=447866&r2=447867
==============================================================================
--- lucene/nutch/branches/branch-0.8/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.8/CHANGES.txt Tue Sep 19 07:52:37 2006
@@ -22,6 +22,9 @@
7. NUTCH-338 - Remove the text parser as an option for parsing PDF files
in parse-plugins.xml (Chris A. Mattmann via siren)
+ 8. NUTCH-105 - Network error during robots.txt fetch causes file to
+ be ignored (Greg Kim via siren)
+
Release 0.8 - 2006-07-25
0. Totally new architecture, based on hadoop
Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=447867&r1=447866&r2=447867
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Sep 19 07:52:37 2006
@@ -420,6 +420,8 @@
RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
+ boolean cacheRule = true;
+
if (robotRules == null) { // cache miss
if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
try {
@@ -430,16 +432,22 @@
robotRules = parseRules(response.getContent());
else if ( (response.getCode() == 403) && (!allowForbidden) )
robotRules = FORBID_ALL_RULES; // use forbid all
- else
+ else if (response.getCode() >= 500) {
+ cacheRule = false;
+ robotRules = EMPTY_RULES;
+ } else
robotRules = EMPTY_RULES; // use default rules
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
}
+ cacheRule = false;
robotRules = EMPTY_RULES;
}
- CACHE.put(host, robotRules); // cache rules for host
+ if (cacheRule){
+ CACHE.put(host, robotRules); // cache rules for host
+ }
}
return robotRules;
}
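For context, the patch boils down to one rule: only cache a host's robot rules when the fetch actually told us something definitive. A 5xx response or a network exception now yields EMPTY_RULES for the current fetch only, without poisoning the per-host cache. Below is a minimal standalone sketch of that pattern; the cacheRule flag and control flow mirror the diff above, but the class name, fetchRobotsTxt(), and the String stand-ins for Nutch's RobotRuleSet constants are simplified illustrations, not the actual Nutch API.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class RobotRulesCacheSketch {

  // Simplified stand-ins for Nutch's RobotRuleSet constants.
  static final String EMPTY_RULES = "EMPTY_RULES";
  static final String FORBID_ALL_RULES = "FORBID_ALL_RULES";

  static final Map<String, String> CACHE = new ConcurrentHashMap<>();
  static final boolean allowForbidden = false;

  // Hypothetical fetch: returns an HTTP status code, or throws on network error.
  static int fetchRobotsTxt(String host) throws Exception {
    return 503; // pretend the server is temporarily unavailable
  }

  static String getRobotRules(String host) {
    String robotRules = CACHE.get(host);
    boolean cacheRule = true; // assume the result is cacheable until proven otherwise

    if (robotRules == null) { // cache miss
      try {
        int code = fetchRobotsTxt(host);
        if (code == 200) {
          robotRules = "PARSED_RULES"; // would be parseRules(content) in Nutch
        } else if (code == 403 && !allowForbidden) {
          robotRules = FORBID_ALL_RULES; // definitive answer: cache it
        } else if (code >= 500) {
          cacheRule = false; // transient server error: retry on the next fetch
          robotRules = EMPTY_RULES;
        } else {
          robotRules = EMPTY_RULES; // e.g. 404: no robots.txt exists, cache that fact
        }
      } catch (Throwable t) {
        cacheRule = false; // network error: don't treat silence as permission forever
        robotRules = EMPTY_RULES;
      }
      if (cacheRule) {
        CACHE.put(host, robotRules); // cache rules for host
      }
    }
    return robotRules;
  }

  public static void main(String[] args) {
    System.out.println(getRobotRules("example.com"));      // EMPTY_RULES for this fetch
    System.out.println(CACHE.containsKey("example.com"));  // false: host will be re-queried
  }
}

The design choice here is the fix NUTCH-105 describes: before the patch, a failed fetch cached EMPTY_RULES indefinitely, so one transient outage made the crawler ignore that host's robots.txt for the life of the cache; afterwards, only 200, 403, and 404-style responses are remembered.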