Posted to commits@nutch.apache.org by sn...@apache.org on 2014/05/12 21:39:43 UTC
svn commit: r1594071 - in /nutch: branches/2.x/
branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
trunk/ trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
Author: snagel
Date: Mon May 12 19:39:43 2014
New Revision: 1594071
URL: http://svn.apache.org/r1594071
Log:
NUTCH-1752 Cache robots.txt rules per protocol:host:port
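As context for the patch below: the robots.txt cache key now includes the
port, so rules fetched from one port are no longer reused for a server on
another port of the same host. A minimal sketch (not part of the commit;
class name illustrative) of how the new key behaves, assuming standard
java.net.URL semantics; the helper mirrors the committed getCacheKey():

    import java.net.URL;

    public class CacheKeyDemo {
      static String cacheKey(URL url) {
        String protocol = url.getProtocol().toLowerCase(); // "http", "https", ...
        String host = url.getHost().toLowerCase();         // host names are case-insensitive
        int port = url.getPort();                          // -1 if no explicit port
        if (port == -1) {
          port = url.getDefaultPort();                     // 80 for http, 443 for https
        }
        return protocol + ":" + host + ":" + port;
      }

      public static void main(String[] args) throws Exception {
        // Same key: an explicit default port and no port collapse together.
        System.out.println(cacheKey(new URL("http://Example.com/a.html")));    // http:example.com:80
        System.out.println(cacheKey(new URL("http://example.com:80/b.html"))); // http:example.com:80
        // Different keys: protocol and port are now part of the key.
        System.out.println(cacheKey(new URL("https://example.com/c.html")));   // https:example.com:443
        System.out.println(cacheKey(new URL("http://example.com:8080/d")));    // http:example.com:8080
      }
    }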
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon May 12 19:39:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
+
* NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with >2 threads (brian44 via jnioche)
* NUTCH-1182 fetcher to log hung threads (snagel)
Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Mon May 12 19:39:43 2014
@@ -48,23 +48,38 @@ public class HttpRobotRulesParser extend
allowForbidden = conf.getBoolean("http.robots.403.allow", false);
}
+ /** Compose a unique key to store and access robot rules in the cache for a given URL. */
+ protected static String getCacheKey(URL url) {
+ String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
+ String host = url.getHost().toLowerCase(); // normalize to lower case
+ int port = url.getPort();
+ if (port == -1) {
+ port = url.getDefaultPort();
+ }
+ /* Robot rules apply only to host, protocol, and port where robots.txt is
+ * hosted (cf. NUTCH-1752). Consequently, the cache key combines all three. */
+ String cacheKey = protocol + ":" + host + ":" + port;
+ return cacheKey;
+ }
+
/**
- * The hosts for which the caching of robots rules is yet to be done,
- * it sends a Http request to the host corresponding to the {@link URL}
- * passed, gets robots file, parses the rules and caches the rules object
- * to avoid re-work in future.
+ * Get the rules from robots.txt which apply to the given {@code url}.
+ * Robot rules are cached for a unique combination of host, protocol, and
+ * port. If no rules are found in the cache, an HTTP request is sent to
+ * fetch {@code protocol://host:port/robots.txt}. The robots.txt file is
+ * then parsed and the rules are cached to avoid re-fetching and re-parsing.
*
- * @param http The {@link Protocol} object
- * @param url URL
- *
- * @return robotRules A {@link BaseRobotRules} object for the rules
+ * @param http
+ * The {@link Protocol} object
+ * @param url
+ * URL robots.txt applies to
+ *
+ * @return {@link BaseRobotRules} holding the rules from robots.txt
*/
public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
- String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
- String host = url.getHost().toLowerCase(); // normalize to lower case
-
- BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);
+ String cacheKey = getCacheKey(url);
+ BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
boolean cacheRule = true;
@@ -114,10 +129,10 @@ public class HttpRobotRulesParser extend
}
if (cacheRule) {
- CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
- if (redir != null && !redir.getHost().equals(host)) {
+ CACHE.put(cacheKey, robotRules); // cache rules under the protocol:host:port key
+ if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
// cache also for the redirected host
- CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+ CACHE.put(getCacheKey(redir), robotRules);
}
}
}
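For illustration, the old key (protocol + ":" + host) conflated distinct
robots.txt files served on different ports of the same host. The sketch
below (an illustrative example, not code from the patch; the demo class
sits in the parser's package so the protected static helper is accessible)
contrasts the old and new keys:

    package org.apache.nutch.protocol.http.api;

    import java.net.URL;

    public class CacheKeyComparison {
      public static void main(String[] args) throws Exception {
        URL a = new URL("http://example.com/page.html");      // robots.txt served on port 80
        URL b = new URL("http://example.com:8080/page.html"); // robots.txt served on port 8080

        // Old key: the port is lost, so both URLs share one cache entry and
        // rules fetched for one port were wrongly reused for the other.
        String oldA = a.getProtocol().toLowerCase() + ":" + a.getHost().toLowerCase();
        String oldB = b.getProtocol().toLowerCase() + ":" + b.getHost().toLowerCase();
        System.out.println(oldA.equals(oldB)); // true -> collision

        // New key: getCacheKey() keeps the port, giving separate entries.
        System.out.println(HttpRobotRulesParser.getCacheKey(a)); // "http:example.com:80"
        System.out.println(HttpRobotRulesParser.getCacheKey(b)); // "http:example.com:8080"
      }
    }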
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 12 19:39:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
+
* NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with >2 threads (brian44 via jnioche)
* NUTCH-1766 Generator to unlock crawldb and remove tempdir if generate job fails (Diaa via jnioche)
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Mon May 12 19:39:43 2014
@@ -48,23 +48,38 @@ public class HttpRobotRulesParser extend
allowForbidden = conf.getBoolean("http.robots.403.allow", false);
}
+ /** Compose a unique key to store and access robot rules in the cache for a given URL. */
+ protected static String getCacheKey(URL url) {
+ String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
+ String host = url.getHost().toLowerCase(); // normalize to lower case
+ int port = url.getPort();
+ if (port == -1) {
+ port = url.getDefaultPort();
+ }
+ /* Robot rules apply only to host, protocol, and port where robots.txt is
+ * hosted (cf. NUTCH-1752). Consequently, the cache key combines all three. */
+ String cacheKey = protocol + ":" + host + ":" + port;
+ return cacheKey;
+ }
+
/**
- * The hosts for which the caching of robots rules is yet to be done,
- * it sends a Http request to the host corresponding to the {@link URL}
- * passed, gets robots file, parses the rules and caches the rules object
- * to avoid re-work in future.
+ * Get the rules from robots.txt which apply to the given {@code url}.
+ * Robot rules are cached for a unique combination of host, protocol, and
+ * port. If no rules are found in the cache, an HTTP request is sent to
+ * fetch {@code protocol://host:port/robots.txt}. The robots.txt file is
+ * then parsed and the rules are cached to avoid re-fetching and re-parsing.
*
- * @param http The {@link Protocol} object
- * @param url URL
- *
- * @return robotRules A {@link BaseRobotRules} object for the rules
+ * @param http
+ * The {@link Protocol} object
+ * @param url
+ * URL robots.txt applies to
+ *
+ * @return {@link BaseRobotRules} holding the rules from robots.txt
*/
public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
- String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
- String host = url.getHost().toLowerCase(); // normalize to lower case
-
- BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);
+ String cacheKey = getCacheKey(url);
+ BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
boolean cacheRule = true;
@@ -114,10 +129,10 @@ public class HttpRobotRulesParser extend
}
if (cacheRule) {
- CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
- if (redir != null && !redir.getHost().equals(host)) {
+ CACHE.put(cacheKey, robotRules); // cache rules under the protocol:host:port key
+ if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
// cache also for the redirected host
- CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+ CACHE.put(getCacheKey(redir), robotRules);
}
}
}
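For completeness, a hedged sketch of how a fetcher-side caller consumes the
(now per-protocol:host:port cached) rules. The class and method names are
illustrative; the parser and protocol-plugin instances are supplied by the
caller, and isAllowed() comes from the crawler-commons BaseRobotRules API:

    import java.net.URL;
    import crawlercommons.robots.BaseRobotRules;
    import org.apache.nutch.protocol.Protocol;
    import org.apache.nutch.protocol.http.api.HttpRobotRulesParser;

    public class RobotsCheck {
      // parser: an initialized HttpRobotRulesParser; http: the protocol
      // plugin instance (both placeholders supplied by the caller).
      static boolean mayFetch(HttpRobotRulesParser parser, Protocol http,
                              URL url) {
        // The first call per protocol:host:port fetches and parses
        // protocol://host:port/robots.txt; later calls hit the cache.
        BaseRobotRules rules = parser.getRobotRulesSet(http, url);
        return rules.isAllowed(url.toString());
      }
    }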