You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/08/14 17:08:15 UTC

svn commit: r431366 - in /lucene/nutch/branches/branch-0.8: ./ conf/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/protocol/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/

Author: ab
Date: Mon Aug 14 08:08:14 2006
New Revision: 431366

URL: http://svn.apache.org/viewvc?rev=431366&view=rev
Log:
Apply patches in rev 431364.

Modified:
    lucene/nutch/branches/branch-0.8/CHANGES.txt
    lucene/nutch/branches/branch-0.8/conf/nutch-default.xml
    lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java
    lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.8/CHANGES.txt Mon Aug 14 08:08:14 2006
@@ -10,6 +10,9 @@
     
  3. NUTCH-344 - Fix for thread blocking issue (Greg Kim via siren)
 
+ 4. Optionally skip pages with abnormally large Crawl-Delay values
+    (Dennis Kubes via ab)
+
 Release 0.8 - 2006-07-25
 
  0. Totally new architecture, based on hadoop

Modified: lucene/nutch/branches/branch-0.8/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/conf/nutch-default.xml?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/branch-0.8/conf/nutch-default.xml Mon Aug 14 08:08:14 2006
@@ -380,6 +380,18 @@
 </property>
 
 <property>
+ <name>fetcher.max.crawl.delay</name>
+ <value>30</value>
+ <description>
+ If the Crawl-Delay in robots.txt is greater than this value (in seconds),
+ the fetcher will skip the page and generate an error report.
+ If set to -1, the fetcher will never skip such pages and will wait the
+ amount of time specified by the robots.txt Crawl-Delay, however long that
+ might be.
+ </description>
+</property> 
+
+<property>
   <name>fetcher.threads.fetch</name>
   <value>10</value>
   <description>The number of FetcherThreads the fetcher should use.

Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Aug 14 08:08:14 2006
@@ -199,6 +199,7 @@
               case ProtocolStatus.NOTFOUND:
               case ProtocolStatus.ACCESS_DENIED:
               case ProtocolStatus.ROBOTS_DENIED:
+              case ProtocolStatus.WOULDBLOCK:
               case ProtocolStatus.NOTMODIFIED:
                 output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
                 break;

Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java Mon Aug 14 08:08:14 2006
@@ -25,7 +25,7 @@
 import org.apache.hadoop.io.WritableUtils;
 
 /**
- * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ * @author Andrzej Bialecki
  */
 public class ProtocolStatus extends VersionedWritable {
   
@@ -55,11 +55,15 @@
   /** Access denied by robots.txt rules. */
   public static final int ROBOTS_DENIED        = 18;
   /** Too many redirects. */
-  public static final int REDIR_EXCEEDED         = 19;
+  public static final int REDIR_EXCEEDED       = 19;
   /** Not fetching. */
   public static final int NOTFETCHING          = 20;
   /** Unchanged since the last fetch. */
   public static final int NOTMODIFIED          = 21;
+  /** Request was refused by a protocol plugin because it would block.
+   * The expected number of milliseconds to wait before retrying may be
+   * provided in args. */
+  public static final int WOULDBLOCK           = 22;
   
   // Useful static instances for status codes that don't usually require any
   // additional arguments.
@@ -72,6 +76,7 @@
   public static final ProtocolStatus STATUS_REDIR_EXCEEDED = new ProtocolStatus(REDIR_EXCEEDED);
   public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
   public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED);
+  public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(WOULDBLOCK);
   
   private int code;
   private long lastModified;
@@ -93,6 +98,7 @@
     codeToName.put(new Integer(REDIR_EXCEEDED), "redir_exceeded");
     codeToName.put(new Integer(NOTFETCHING), "notfetching");
     codeToName.put(new Integer(NOTMODIFIED), "notmodified");
+    codeToName.put(new Integer(WOULDBLOCK), "wouldblock");
   }
   
   public ProtocolStatus() {
@@ -183,6 +189,7 @@
         code == REDIR_EXCEEDED ||
         code == RETRY ||
         code == TEMP_MOVED ||
+        code == WOULDBLOCK ||
         code == PROTO_NOT_FOUND; 
   }
   

Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Aug 14 08:08:14 2006
@@ -125,6 +125,9 @@
  
   /** Do we use HTTP/1.1? */
   protected boolean useHttp11 = false;
+  
+  /** Skip page if Crawl-Delay is longer than this value (in milliseconds); a negative value disables the check. */
+  protected long maxCrawlDelay = -1L;
 
   /** Creates a new instance of HttpBase */
   public HttpBase() {
@@ -152,6 +155,7 @@
         this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
                 .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
         this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
+        this.maxCrawlDelay = (long)(conf.getInt("fetcher.max.crawl.delay", -1) * 1000);
         // backward-compatible default setting
         this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
         this.useHttp11 = conf.getBoolean("http.http11", false);
@@ -185,6 +189,14 @@
       
       long crawlDelay = robots.getCrawlDelay(this, u);
       long delay = crawlDelay > 0 ? crawlDelay : serverDelay;
+      if (maxCrawlDelay >= 0 && delay > maxCrawlDelay) {
+        // skip this page, otherwise the thread would block for too long.
+        LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max="
+                + (maxCrawlDelay / 1000) + ", Crawl-Delay=" + (delay / 1000));
+        Content c = new Content(u.toString(), u.toString(), EMPTY_CONTENT,
+                null, null, this.conf);
+        return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK);
+      }
       String host = blockAddr(u, delay);
       Response response;
       try {