You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/09/19 16:52:37 UTC

svn commit: r447867 - in /lucene/nutch/branches/branch-0.8: CHANGES.txt src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

Author: siren
Date: Tue Sep 19 07:52:37 2006
New Revision: 447867

URL: http://svn.apache.org/viewvc?view=rev&rev=447867
Log:
NUTCH-105 - Network error during robots.txt fetch causes file to be ignored, contributed by Greg Kim

Modified:
    lucene/nutch/branches/branch-0.8/CHANGES.txt
    lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java

Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diff&rev=447867&r1=447866&r2=447867
==============================================================================
--- lucene/nutch/branches/branch-0.8/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.8/CHANGES.txt Tue Sep 19 07:52:37 2006
@@ -22,6 +22,9 @@
  7. NUTCH-338 - Remove the text parser as an option for parsing PDF files
     in parse-plugins.xml (Chris A. Mattmann via siren)
 
+ 8. NUTCH-105 - Network error during robots.txt fetch causes file to
+    be ignored (Greg Kim via siren)
+    
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop

Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=447867&r1=447866&r2=447867
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Sep 19 07:52:37 2006
@@ -420,6 +420,8 @@
 
     RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
+    boolean cacheRule = true;
+    
     if (robotRules == null) {                     // cache miss
       if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }
       try {
@@ -430,16 +432,22 @@
           robotRules = parseRules(response.getContent());
         else if ( (response.getCode() == 403) && (!allowForbidden) )
           robotRules = FORBID_ALL_RULES;            // use forbid all
-        else                                        
+        else if (response.getCode() >= 500) {
+	  cacheRule = false;
+	  robotRules = EMPTY_RULES;
+        }else                                        
           robotRules = EMPTY_RULES;                 // use default rules
       } catch (Throwable t) {
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
         }
+        cacheRule = false;
         robotRules = EMPTY_RULES;
       }
 
-      CACHE.put(host, robotRules);                // cache rules for host
+      if (cacheRule){
+	CACHE.put(host, robotRules);  // cache rules for host
+      }
     }
     return robotRules;
   }