Posted to commits@nutch.apache.org by ab...@apache.org on 2005/07/02 22:39:15 UTC

svn commit: r208872 - in /lucene/nutch/trunk: conf/nutch-default.xml src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java

Author: ab
Date: Sat Jul  2 13:39:14 2005
New Revision: 208872

URL: http://svn.apache.org/viewcvs?rev=208872&view=rev
Log:
Applied patches in NUTCH-56, with minor changes. Submitted by Andy Liu.

Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
    lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=208872&r1=208871&r2=208872&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Jul  2 13:39:14 2005
@@ -23,6 +23,15 @@
 </property>
 
 <property>
+  <name>http.robots.403.allow</name>
+  <value>true</value>
+  <description>Some servers return HTTP status 403 (Forbidden) if
+  /robots.txt doesn't exist. This should probably mean that we are
+  allowed to crawl the site nonetheless. If this is set to false,
+  then such sites will be treated as forbidden.</description>
+</property>
+
+<property>
   <name>http.agent.description</name>
   <value>Nutch</value>
   <description>Further description of our bot- this text is used in
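
For reference, a site operator who wants the old behavior back can override this property in nutch-site.xml, Nutch's standard local override of nutch-default.xml. A minimal sketch (only the property name is taken from the patch above; the value shown is just the opt-out):

  <property>
    <name>http.robots.403.allow</name>
    <value>false</value>
  </property>

With the value set to false, a 403 response for /robots.txt is once again treated as "forbid all" for that host.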

Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=208872&r1=208871&r2=208872&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Sat Jul  2 13:39:14 2005
@@ -49,6 +49,9 @@
 public class RobotRulesParser {
   public static final Logger LOG=
     LogFormatter.getLogger("org.apache.nutch.fetcher.RobotRulesParser");
+  
+  private static final boolean ALLOW_FORBIDDEN =
+    NutchConf.get().getBoolean("http.robots.403.allow", false);
 
   private static final String[] AGENTS = getAgents();
   private static final Hashtable CACHE = new Hashtable();
@@ -378,7 +381,7 @@
 
       if (response.getCode() == 200)               // found rules: parse them
         robotRules = new RobotRulesParser().parseRules(response.getContent());
-      else if (response.getCode() == 403)
+      else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
         robotRules = FORBID_ALL_RULES;            // use forbid all
       else                                        
         robotRules = EMPTY_RULES;                 // use default rules
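
The resulting decision can be summarized in a standalone sketch. FORBID_ALL_RULES, EMPTY_RULES and the 200/403 branches mirror the patched method above; the class name, the rulesFor helper and its string return values are illustrative only, and ALLOW_FORBIDDEN stands in for the http.robots.403.allow setting:

  // Sketch of the patched robots.txt decision, not the actual Nutch class.
  public class RobotRulesSketch {
    static final String FORBID_ALL_RULES = "forbid-all";
    static final String EMPTY_RULES = "empty";
    // nutch-default.xml ships this as true; the code-level fallback is false.
    static final boolean ALLOW_FORBIDDEN = true;

    static String rulesFor(int httpCode) {
      if (httpCode == 200)                            // found rules: parse them
        return "parsed";                              // stands in for parseRules(...)
      else if (httpCode == 403 && !ALLOW_FORBIDDEN)   // 403 forbids only if configured
        return FORBID_ALL_RULES;
      else                                            // 404, 500, ... and now 403 too
        return EMPTY_RULES;
    }

    public static void main(String[] args) {
      System.out.println(rulesFor(200));  // parsed
      System.out.println(rulesFor(403));  // empty -> site may be crawled
      System.out.println(rulesFor(404));  // empty
    }
  }

In other words, with the default configuration a 403 now falls through to EMPTY_RULES, the same treatment a missing robots.txt (404) already received.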

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=208872&r1=208871&r2=208872&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Sat Jul  2 13:39:14 2005
@@ -50,6 +50,9 @@
   public static final Logger LOG=
     LogFormatter.getLogger("org.apache.nutch.fetcher.RobotRulesParser");
 
+  private static final boolean ALLOW_FORBIDDEN =
+    NutchConf.get().getBoolean("http.robots.403.allow", false);
+
   private static final String[] AGENTS = getAgents();
   private static final Hashtable CACHE = new Hashtable();
   
@@ -380,7 +383,7 @@
 
         if (response.getCode() == 200)               // found rules: parse them
           robotRules = new RobotRulesParser().parseRules(response.getContent());
-        else if (response.getCode() == 403)
+        else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
           robotRules = FORBID_ALL_RULES;            // use forbid all
         else                                        
           robotRules = EMPTY_RULES;                 // use default rules