Posted to commits@nutch.apache.org by ma...@apache.org on 2015/04/22 03:46:28 UTC
svn commit: r1675243 [2/2] - in /nutch/trunk: ./ conf/ ivy/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/ src/java/org/apache...
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Wed Apr 22 01:46:28 2015
@@ -29,6 +29,7 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.RobotRulesParser;
 
 import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
 
 /**
  * This class is used for parsing robots for urls belonging to HTTP protocol. It
@@ -86,31 +87,16 @@ public class HttpRobotRulesParser extend
    */
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
-    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
-      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
-    }
-
     String cacheKey = getCacheKey(url);
-    BaseRobotRules robotRules = CACHE.get(cacheKey);
-
-    if (robotRules != null) {
-      return robotRules; // cached rule
-    } else if (LOG.isTraceEnabled()) {
-      LOG.trace("cache miss " + url);
-    }
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
     boolean cacheRule = true;
-    URL redir = null;
-
-    if (isWhiteListed(url)) {
-      // check in advance whether a host is whitelisted
-      // (we do not need to fetch robots.txt)
-      robotRules = EMPTY_RULES;
-      LOG.info("Whitelisted host found for: {}", url);
-      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
-          url.getHost());
-    } else {
+    if (robotRules == null) { // cache miss
+      URL redir = null;
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("cache miss " + url);
+      }
       try {
         Response response = ((HttpBase) http).getResponse(new URL(url,
             "/robots.txt"), new CrawlDatum(), true);
@@ -141,7 +127,7 @@ public class HttpRobotRulesParser extend
         else if ((response.getCode() == 403) && (!allowForbidden))
           robotRules = FORBID_ALL_RULES; // use forbid all
         else if (response.getCode() >= 500) {
-          cacheRule = false; // try again later to fetch robots.txt
+          cacheRule = false;
           robotRules = EMPTY_RULES;
         } else
           robotRules = EMPTY_RULES; // use default rules
@@ -149,19 +135,18 @@ public class HttpRobotRulesParser extend
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
         }
-        cacheRule = false; // try again later to fetch robots.txt
+        cacheRule = false;
         robotRules = EMPTY_RULES;
       }
-    }
-    if (cacheRule) {
-      CACHE.put(cacheKey, robotRules); // cache rules for host
-      if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
-        // cache also for the redirected host
-        CACHE.put(getCacheKey(redir), robotRules);
+      if (cacheRule) {
+        CACHE.put(cacheKey, robotRules); // cache rules for host
+        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
+          // cache also for the redirected host
+          CACHE.put(getCacheKey(redir), robotRules);
+        }
       }
     }
-
     return robotRules;
   }
 }
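
Read together, the hunks above fold all cache bookkeeping into the cache-miss branch: rules are looked up once per protocol:host key, fetched and parsed only on a miss, and written back unless the fetch failed transiently (cacheRule = false). The condensed sketch below illustrates that flow; it is not the Nutch source, and RobotRulesCacheSketch, RobotRules, and the in-memory HashMap are hypothetical stand-ins for HttpRobotRulesParser, crawlercommons' BaseRobotRules, and the shared CACHE.

    import java.net.URL;
    import java.util.HashMap;
    import java.util.Map;

    /** Condensed sketch of the post-commit caching flow; not the Nutch source. */
    public class RobotRulesCacheSketch {

      /** Hypothetical stand-in for crawlercommons.robots.BaseRobotRules. */
      static class RobotRules {
        final String origin;
        RobotRules(String origin) { this.origin = origin; }
      }

      private static final Map<String, RobotRules> CACHE = new HashMap<>();

      static RobotRules getRobotRulesSet(URL url) {
        String cacheKey = url.getProtocol() + ":" + url.getHost().toLowerCase();
        RobotRules robotRules = CACHE.get(cacheKey);

        boolean cacheRule = true;
        if (robotRules == null) { // cache miss: fetch and parse robots.txt
          try {
            // A real implementation would fetch new URL(url, "/robots.txt") here.
            robotRules = new RobotRules("fetched for " + cacheKey);
          } catch (Exception e) {
            cacheRule = false; // transient failure: leave uncached, retry later
            robotRules = new RobotRules("empty");
          }
          if (cacheRule) {
            CACHE.put(cacheKey, robotRules); // one entry per protocol:host
          }
        }
        return robotRules;
      }

      public static void main(String[] args) throws Exception {
        URL url = new URL("http://example.org/page.html");
        System.out.println(getRobotRulesSet(url).origin); // cache miss
        System.out.println(getRobotRulesSet(url).origin); // cache hit, same entry
      }
    }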
Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java Wed Apr 22 01:46:28 2015
@@ -69,28 +69,15 @@ public class FtpRobotRulesParser extends
     // case
     String host = url.getHost().toLowerCase(); // normalize to lower case
 
-    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
-      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
-    }
-
-    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
-
-    if (robotRules != null) {
-      return robotRules; // cached rule
-    } else if (LOG.isTraceEnabled()) {
-      LOG.trace("cache miss " + url);
-    }
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":"
+        + host);
 
     boolean cacheRule = true;
 
-    if (isWhiteListed(url)) {
-      // check in advance whether a host is whitelisted
-      // (we do not need to fetch robots.txt)
-      robotRules = EMPTY_RULES;
-      LOG.info("Whitelisted host found for: {}", url);
-      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
+    if (robotRules == null) { // cache miss
+      if (LOG.isTraceEnabled())
+        LOG.trace("cache miss " + url);
 
-    } else {
       try {
         Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
         ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
@@ -107,15 +94,13 @@ public class FtpRobotRulesParser extends
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
         }
-        cacheRule = false; // try again later to fetch robots.txt
+        cacheRule = false;
         robotRules = EMPTY_RULES;
       }
+      if (cacheRule)
+        CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
     }
-
-    if (cacheRule)
-      CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
-
     return robotRules;
   }
 }
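
One difference between the two parsers survives this refactoring: on a redirected robots.txt fetch the HTTP parser also caches the rules under the redirect target's host, while the FTP parser keys only on protocol:host. A minimal sketch of that redirect-aware cache write, with hypothetical names (cacheKey, cacheRules) standing in for the Nutch internals:

    import java.net.URL;
    import java.util.HashMap;
    import java.util.Map;

    /** Sketch of the HTTP parser's redirect-aware cache write; hypothetical names. */
    public class RedirectCacheSketch {

      private static final Map<String, String> CACHE = new HashMap<>();

      static String cacheKey(URL u) {
        return u.getProtocol() + ":" + u.getHost().toLowerCase();
      }

      static void cacheRules(URL url, URL redir, String rules) {
        CACHE.put(cacheKey(url), rules); // cache rules for the requested host
        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
          CACHE.put(cacheKey(redir), rules); // and for the redirected host
        }
      }

      public static void main(String[] args) throws Exception {
        cacheRules(new URL("http://example.org/"),
            new URL("http://www.example.org/"), "allow-all");
        System.out.println(CACHE.size()); // 2: requested and redirected host
      }
    }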