Posted to commits@nutch.apache.org by ma...@apache.org on 2015/04/22 03:46:28 UTC

svn commit: r1675243 [2/2] - in /nutch/trunk: ./ conf/ ivy/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/ src/java/org/apache...

Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Wed Apr 22 01:46:28 2015
@@ -29,6 +29,7 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.RobotRulesParser;
 
 import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
 
 /**
  * This class is used for parsing robots.txt files for URLs belonging to the HTTP protocol. It
@@ -86,31 +87,16 @@ public class HttpRobotRulesParser extend
    */
   public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
 
-    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
-      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
-    }
-
     String cacheKey = getCacheKey(url);
-    BaseRobotRules robotRules = CACHE.get(cacheKey);
-
-    if (robotRules != null) {
-      return robotRules; // cached rule
-    } else if (LOG.isTraceEnabled()) {
-      LOG.trace("cache miss " + url);
-    }
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
 
     boolean cacheRule = true;
-    URL redir = null;
-
-    if (isWhiteListed(url)) {
-      // check in advance whether a host is whitelisted
-      // (we do not need to fetch robots.txt)
-      robotRules = EMPTY_RULES;
-      LOG.info("Whitelisted host found for: {}", url);
-      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
-          url.getHost());
 
-    } else {
+    if (robotRules == null) { // cache miss
+      URL redir = null;
+      if (LOG.isTraceEnabled()) {
+        LOG.trace("cache miss " + url);
+      }
       try {
         Response response = ((HttpBase) http).getResponse(new URL(url,
             "/robots.txt"), new CrawlDatum(), true);
@@ -141,7 +127,7 @@ public class HttpRobotRulesParser extend
         else if ((response.getCode() == 403) && (!allowForbidden))
           robotRules = FORBID_ALL_RULES; // use forbid all
         else if (response.getCode() >= 500) {
-          cacheRule = false; // try again later to fetch robots.txt
+          cacheRule = false; // try again later to fetch robots.txt
           robotRules = EMPTY_RULES;
         } else
           robotRules = EMPTY_RULES; // use default rules
@@ -149,19 +135,18 @@ public class HttpRobotRulesParser extend
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
         }
-        cacheRule = false; // try again later to fetch robots.txt
+        cacheRule = false; // try again later to fetch robots.txt
         robotRules = EMPTY_RULES;
       }
-    }
 
-    if (cacheRule) {
-      CACHE.put(cacheKey, robotRules); // cache rules for host
-      if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
-        // cache also for the redirected host
-        CACHE.put(getCacheKey(redir), robotRules);
+      if (cacheRule) {
+        CACHE.put(cacheKey, robotRules); // cache rules for host
+        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
+          // cache also for the redirected host
+          CACHE.put(getCacheKey(redir), robotRules);
+        }
       }
     }
-
     return robotRules;
   }
 }
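
For readers skimming the diff: the change folds the cache lookup, the robots.txt fetch, and the cache write into a single cache-miss branch (previously the fetch sat in an else-branch of the now-removed whitelist check, and the cache write sat outside it). Below is a minimal, self-contained sketch of the resulting pattern; String stands in for BaseRobotRules, and the names (RulesCacheSketch, getRules, fetchRobotsTxt) are illustrative, not the actual Nutch API.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Minimal sketch of the pattern this commit consolidates: look up cached
 * rules, fetch only on a miss, and skip the cache write when the failure
 * looks transient. Names are illustrative, not the Nutch API.
 */
public class RulesCacheSketch {

  private static final Map<String, String> CACHE = new ConcurrentHashMap<>();
  private static final String EMPTY_RULES = "allow-all";

  static String getRules(String cacheKey) {
    String rules = CACHE.get(cacheKey);
    boolean cacheRule = true;
    if (rules == null) { // cache miss: fetch and parse exactly once
      try {
        rules = fetchRobotsTxt(cacheKey);
      } catch (Exception e) {
        cacheRule = false; // transient failure: retry on the next lookup
        rules = EMPTY_RULES;
      }
      if (cacheRule) {
        CACHE.put(cacheKey, rules); // cache rules for this host
      }
    }
    return rules;
  }

  static String fetchRobotsTxt(String cacheKey) throws Exception {
    return "rules-for-" + cacheKey; // stand-in for the real robots.txt fetch
  }

  public static void main(String[] args) {
    System.out.println(getRules("http:example.com:80")); // miss: fetches
    System.out.println(getRules("http:example.com:80")); // hit: cached
  }
}

In the real HttpRobotRulesParser the miss branch additionally follows a redirect and, when the redirect lands on a different host, stores the same rules under that host's key as well (the CACHE.put(getCacheKey(redir), robotRules) call in the diff above).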

Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1675243&r1=1675242&r2=1675243&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java Wed Apr 22 01:46:28 2015
@@ -69,28 +69,15 @@ public class FtpRobotRulesParser extends
                                                        // case
     String host = url.getHost().toLowerCase(); // normalize to lower case
 
-    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
-      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
-    }
-
-    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
-
-    if (robotRules != null) {
-      return robotRules; // cached rule
-    } else if (LOG.isTraceEnabled()) {
-      LOG.trace("cache miss " + url);
-    }
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":"
+        + host);
 
     boolean cacheRule = true;
 
-    if (isWhiteListed(url)) {
-      // check in advance whether a host is whitelisted
-      // (we do not need to fetch robots.txt)
-      robotRules = EMPTY_RULES;
-      LOG.info("Whitelisted host found for: {}", url);
-      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
+    if (robotRules == null) { // cache miss
+      if (LOG.isTraceEnabled())
+        LOG.trace("cache miss " + url);
 
-    } else {
       try {
         Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
         ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
@@ -107,15 +94,13 @@ public class FtpRobotRulesParser extends
         if (LOG.isInfoEnabled()) {
           LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
         }
-        cacheRule = false; // try again later to fetch robots.txt
+        cacheRule = false; // try again later to fetch robots.txt
         robotRules = EMPTY_RULES;
       }
 
+      if (cacheRule)
+        CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
     }
-
-    if (cacheRule)
-      CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
-
     return robotRules;
   }
 }
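
The FTP-side change is the same restructuring, minus redirect handling: rules are cached under protocol + ":" + host, so every path on one FTP host shares a single cached entry. A tiny, hypothetical illustration of that key scheme, using only java.net.URL (CacheKeySketch and cacheKey are made-up names, not Nutch code):

import java.net.URL;

public class CacheKeySketch {

  // Mirrors the key construction in FtpRobotRulesParser: protocol and host,
  // both normalized to lower case.
  static String cacheKey(URL url) {
    return url.getProtocol().toLowerCase() + ":" + url.getHost().toLowerCase();
  }

  public static void main(String[] args) throws Exception {
    URL a = new URL("ftp://Example.COM/pub/a.txt");
    URL b = new URL("ftp://example.com/other/b.txt");
    System.out.println(cacheKey(a)); // ftp:example.com
    System.out.println(cacheKey(b)); // ftp:example.com -> same cache entry
  }
}

Both URLs map to one key, so a robots.txt fetched for the first is reused for the second; only a failed fetch (the cacheRule = false path) leaves the entry uncached, so it is retried on the next lookup.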