Posted to commits@nutch.apache.org by sn...@apache.org on 2014/05/12 21:39:43 UTC

svn commit: r1594071 - in /nutch: branches/2.x/ branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ trunk/ trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/

Author: snagel
Date: Mon May 12 19:39:43 2014
New Revision: 1594071

URL: http://svn.apache.org/r1594071
Log:
NUTCH-1752 Cache robots.txt rules per protocol:host:port
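
For context, a minimal standalone sketch of the key composition this change
introduces (the class name and example URLs below are illustrative, not part
of the patch; the real code lives in HttpRobotRulesParser.getCacheKey):

    import java.net.MalformedURLException;
    import java.net.URL;

    // Sketch: robots.txt rules are now cached per protocol:host:port
    // instead of per protocol:host.
    public class CacheKeyDemo {

      static String getCacheKey(URL url) {
        String protocol = url.getProtocol().toLowerCase(); // normalize case
        String host = url.getHost().toLowerCase();         // normalize case
        int port = url.getPort();
        if (port == -1) {
          port = url.getDefaultPort(); // e.g. 80 for http, 443 for https
        }
        return protocol + ":" + host + ":" + port;
      }

      public static void main(String[] args) throws MalformedURLException {
        // http and https on the same host map to distinct cache entries:
        System.out.println(getCacheKey(new URL("http://Example.com/page")));  // http:example.com:80
        System.out.println(getCacheKey(new URL("https://example.com/page"))); // https:example.com:443
        // an explicit default port and no port collapse to the same key:
        System.out.println(getCacheKey(new URL("http://example.com:80/")));   // http:example.com:80
      }
    }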

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon May 12 19:39:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
+
 * NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with >2 threads (brian44 via jnioche)
 
 * NUTCH-1182 fetcher to log hung threads (snagel)

Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Mon May 12 19:39:43 2014
@@ -48,23 +48,38 @@ public class HttpRobotRulesParser extend
     allowForbidden = conf.getBoolean("http.robots.403.allow", false);
   }
 
+  /** Compose a unique key to store and access robot rules in the cache for the given URL. */
+  protected static String getCacheKey(URL url) {
+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
+    String host = url.getHost().toLowerCase();          // normalize to lower case
+    int port = url.getPort();
+    if (port == -1) {
+      port = url.getDefaultPort();
+    }
+   /* Robot rules apply only to the protocol, host, and port where the
+    * robots.txt is hosted (cf. NUTCH-1752), so all three go into the key. */
+    String cacheKey = protocol + ":" + host + ":" + port;
+    return cacheKey;
+  }
+
   /**
-   * The hosts for which the caching of robots rules is yet to be done,
-   * it sends a Http request to the host corresponding to the {@link URL} 
-   * passed, gets robots file, parses the rules and caches the rules object
-   * to avoid re-work in future.
+   * Get the rules from robots.txt which apply to the given {@code url}.
+   * Robot rules are cached for a unique combination of host, protocol, and
+   * port. If no rules are found in the cache, an HTTP request is sent to
+   * fetch {@code protocol://host:port/robots.txt}. The robots.txt is then
+   * parsed and the rules are cached to avoid re-fetching and re-parsing.
    * 
-   *  @param http The {@link Protocol} object
-   *  @param url URL 
-   *  
-   *  @return robotRules A {@link BaseRobotRules} object for the rules
+   * @param http
+   *          The {@link Protocol} object
+   * @param url
+   *          URL robots.txt applies to
+   *
+   * @return {@link BaseRobotRules} holding the rules from robots.txt
    */
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
 
-    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
-    String host = url.getHost().toLowerCase();          // normalize to lower case
-
-    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);
+    String cacheKey = getCacheKey(url);
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
     boolean cacheRule = true;
     
@@ -114,10 +129,10 @@ public class HttpRobotRulesParser extend
       }
 
       if (cacheRule) {
-        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host
-        if (redir != null && !redir.getHost().equals(host)) {
+        CACHE.put(cacheKey, robotRules);  // cache rules for the protocol:host:port
+        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
           // cache also for the redirected host
-          CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+          CACHE.put(getCacheKey(redir), robotRules);
         }
       }
     }
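
The last hunk above also stores the fetched rules under the redirect target's
key when the robots.txt request is redirected to a different host. A rough,
self-contained illustration of that double-caching, using a plain HashMap in
place of Nutch's CACHE and a placeholder object for the parsed rules (the
URLs are made up):

    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.HashMap;
    import java.util.Map;

    public class RedirectCacheDemo {

      // same key composition as in the patch above
      static String getCacheKey(URL url) {
        int port = url.getPort() == -1 ? url.getDefaultPort() : url.getPort();
        return url.getProtocol().toLowerCase() + ":"
            + url.getHost().toLowerCase() + ":" + port;
      }

      public static void main(String[] args) throws MalformedURLException {
        Map<String, Object> cache = new HashMap<>();
        URL url = new URL("http://example.com/");
        // pretend http://example.com/robots.txt redirected here:
        URL redir = new URL("http://www.example.com/robots.txt");
        Object robotRules = new Object(); // stand-in for the parsed BaseRobotRules

        cache.put(getCacheKey(url), robotRules); // cache for the original authority
        if (!redir.getHost().equalsIgnoreCase(url.getHost())) {
          cache.put(getCacheKey(redir), robotRules); // and for the redirect target
        }
        // two entries: http:example.com:80 and http:www.example.com:80
        System.out.println(cache.keySet());
      }
    }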

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon May 12 19:39:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
+
 * NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with >2 threads (brian44 via jnioche)
 
 * NUTCH-1766 Generator to unlock crawldb and remove tempdir if generate job fails (Diaa via jnioche)

Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1594071&r1=1594070&r2=1594071&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Mon May 12 19:39:43 2014
@@ -48,23 +48,38 @@ public class HttpRobotRulesParser extend
     allowForbidden = conf.getBoolean("http.robots.403.allow", false);
   }
 
+  /** Compose a unique key to store and access robot rules in the cache for the given URL. */
+  protected static String getCacheKey(URL url) {
+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
+    String host = url.getHost().toLowerCase();          // normalize to lower case
+    int port = url.getPort();
+    if (port == -1) {
+      port = url.getDefaultPort();
+    }
+   /* Robot rules apply only to the protocol, host, and port where the
+    * robots.txt is hosted (cf. NUTCH-1752), so all three go into the key. */
+    String cacheKey = protocol + ":" + host + ":" + port;
+    return cacheKey;
+  }
+
   /**
-   * The hosts for which the caching of robots rules is yet to be done,
-   * it sends a Http request to the host corresponding to the {@link URL} 
-   * passed, gets robots file, parses the rules and caches the rules object
-   * to avoid re-work in future.
+   * Get the rules from robots.txt which apply to the given {@code url}.
+   * Robot rules are cached for a unique combination of host, protocol, and
+   * port. If no rules are found in the cache, an HTTP request is sent to
+   * fetch {@code protocol://host:port/robots.txt}. The robots.txt is then
+   * parsed and the rules are cached to avoid re-fetching and re-parsing.
    * 
-   *  @param http The {@link Protocol} object
-   *  @param url URL 
-   *  
-   *  @return robotRules A {@link BaseRobotRules} object for the rules
+   * @param http
+   *          The {@link Protocol} object
+   * @param url
+   *          URL robots.txt applies to
+   *
+   * @return {@link BaseRobotRules} holding the rules from robots.txt
    */
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
 
-    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
-    String host = url.getHost().toLowerCase();          // normalize to lower case
-
-    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);
+    String cacheKey = getCacheKey(url);
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
     boolean cacheRule = true;
     
@@ -114,10 +129,10 @@ public class HttpRobotRulesParser extend
       }
 
       if (cacheRule) {
-        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host
-        if (redir != null && !redir.getHost().equals(host)) {
+        CACHE.put(cacheKey, robotRules);  // cache rules for the protocol:host:port
+        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
           // cache also for the redirected host
-          CACHE.put(protocol + ":" + redir.getHost(), robotRules);
+          CACHE.put(getCacheKey(redir), robotRules);
         }
       }
     }
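
A small point present in both copies of the patch: the redirect check now uses
equalsIgnoreCase instead of equals, since DNS host names are case-insensitive.
A tiny sketch of the difference (the host strings are illustrative):

    public class HostCompareDemo {
      public static void main(String[] args) {
        String host = "example.com";      // host of the requested URL
        String redirHost = "Example.COM"; // host reported by the redirect

        // old check: the strings differ, so the same host looked "different"
        System.out.println(redirHost.equals(host));           // false
        // new check: recognized as the same host, no redundant cache entry
        System.out.println(redirHost.equalsIgnoreCase(host)); // true
      }
    }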