You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/11/07 18:53:57 UTC
svn commit: r331555 - in
/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient:
HttpResponse.java RobotRulesParser.java
Author: cutting
Date: Mon Nov 7 09:53:54 2005
New Revision: 331555
URL: http://svn.apache.org/viewcvs?rev=331555&view=rev
Log:
NUTCH-124: Follow redirects when fetching robots.txt.
Modified:
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=331555&r1=331554&r2=331555&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Mon Nov 7 09:53:54 2005
@@ -82,10 +82,14 @@
}
public HttpResponse(URL url) throws IOException {
+ this(url, false);
+ }
+
+ HttpResponse(URL url, boolean followRedirects) throws IOException {
this.base = url.toString();
this.orig = url.toString();
GetMethod get = new GetMethod(this.orig);
- get.setFollowRedirects(false);
+ get.setFollowRedirects(followRedirects);
get.setRequestHeader("User-Agent", Http.AGENT_STRING);
HttpMethodParams params = get.getParams();
// some servers cannot digest the new protocol
Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=331555&r1=331554&r2=331555&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original)
+++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Mon Nov 7 09:53:54 2005
@@ -379,7 +379,8 @@
if (robotRules == null) { // cache miss
LOG.fine("cache miss " + url);
try {
- HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+ HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
+ true);
if (response.getCode() == 200) // found rules: parse them
robotRules = new RobotRulesParser().parseRules(response.getContent());