You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/07/02 22:39:15 UTC
svn commit: r208872 - in /lucene/nutch/trunk: conf/nutch-default.xml
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
Author: ab
Date: Sat Jul 2 13:39:14 2005
New Revision: 208872
URL: http://svn.apache.org/viewcvs?rev=208872&view=rev
Log:
Applied patches in NUTCH-56, with minor changes. Submitted by Andy Liu.
Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=208872&r1=208871&r2=208872&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Sat Jul 2 13:39:14 2005
@@ -23,6 +23,15 @@
</property>
<property>
+ <name>http.robots.403.allow</name>
+ <value>true</value>
+ <description>Some servers return HTTP status 403 (Forbidden) when
+ /robots.txt doesn't exist. If this is set to true, a 403 response for
+ /robots.txt is treated as having no rules, and the site is crawled anyway.
+ If this is set to false, such sites are treated as forbidden.</description>
+</property>
+
+<property>
<name>http.agent.description</name>
<value>Nutch</value>
<description>Further description of our bot- this text is used in
Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=208872&r1=208871&r2=208872&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Sat Jul 2 13:39:14 2005
@@ -49,6 +49,9 @@
public class RobotRulesParser {
public static final Logger LOG=
LogFormatter.getLogger("org.apache.nutch.fetcher.RobotRulesParser");
+
+ private static final boolean ALLOW_FORBIDDEN =
+ NutchConf.get().getBoolean("http.robots.403.allow", false);
private static final String[] AGENTS = getAgents();
private static final Hashtable CACHE = new Hashtable();
@@ -378,7 +381,7 @@
if (response.getCode() == 200) // found rules: parse them
robotRules = new RobotRulesParser().parseRules(response.getContent());
- else if (response.getCode() == 403)
+ else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
robotRules = FORBID_ALL_RULES; // use forbid all
else
robotRules = EMPTY_RULES; // use default rules
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=208872&r1=208871&r2=208872&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Sat Jul 2 13:39:14 2005
@@ -50,6 +50,9 @@
public static final Logger LOG=
LogFormatter.getLogger("org.apache.nutch.fetcher.RobotRulesParser");
+ private static final boolean ALLOW_FORBIDDEN =
+ NutchConf.get().getBoolean("http.robots.403.allow", false);
+
private static final String[] AGENTS = getAgents();
private static final Hashtable CACHE = new Hashtable();
@@ -380,7 +383,7 @@
if (response.getCode() == 200) // found rules: parse them
robotRules = new RobotRulesParser().parseRules(response.getContent());
- else if (response.getCode() == 403)
+ else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
robotRules = FORBID_ALL_RULES; // use forbid all
else
robotRules = EMPTY_RULES; // use default rules