You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/08/14 17:08:15 UTC
svn commit: r431366 - in /lucene/nutch/branches/branch-0.8: ./ conf/
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/protocol/
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
Author: ab
Date: Mon Aug 14 08:08:14 2006
New Revision: 431366
URL: http://svn.apache.org/viewvc?rev=431366&view=rev
Log:
Apply patches in rev 431364.
Modified:
lucene/nutch/branches/branch-0.8/CHANGES.txt
lucene/nutch/branches/branch-0.8/conf/nutch-default.xml
lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java
lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/CHANGES.txt (original)
+++ lucene/nutch/branches/branch-0.8/CHANGES.txt Mon Aug 14 08:08:14 2006
@@ -10,6 +10,9 @@
3. NUTCH-344 - Fix for thread blocking issue (Greg Kim via siren)
+ 4. Optionally skip pages with abnormally large Crawl-Delay values
+ (Dennis Kubes via ab)
+
Release 0.8 - 2006-07-25
0. Totally new architecture, based on hadoop
Modified: lucene/nutch/branches/branch-0.8/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/conf/nutch-default.xml?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/branch-0.8/conf/nutch-default.xml Mon Aug 14 08:08:14 2006
@@ -380,6 +380,18 @@
</property>
<property>
+ <name>fetcher.max.crawl.delay</name>
+ <value>30</value>
+ <description>
+ If the Crawl-Delay specified in robots.txt is greater than this value (in
+ seconds), the fetcher will skip the page and generate an error report.
+ If set to -1, the fetcher will never skip such pages and will wait for the
+ full amount of time given by the robots.txt Crawl-Delay, however long that
+ might be.
+ </description>
+</property>
+
+<property>
<name>fetcher.threads.fetch</name>
<value>10</value>
<description>The number of FetcherThreads the fetcher should use.
Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Aug 14 08:08:14 2006
@@ -199,6 +199,7 @@
case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
+ case ProtocolStatus.WOULDBLOCK:
case ProtocolStatus.NOTMODIFIED:
output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
break;
Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/protocol/ProtocolStatus.java Mon Aug 14 08:08:14 2006
@@ -25,7 +25,7 @@
import org.apache.hadoop.io.WritableUtils;
/**
- * @author Andrzej Bialecki <ab@getopt.org>
+ * @author Andrzej Bialecki
*/
public class ProtocolStatus extends VersionedWritable {
@@ -55,11 +55,15 @@
/** Access denied by robots.txt rules. */
public static final int ROBOTS_DENIED = 18;
/** Too many redirects. */
- public static final int REDIR_EXCEEDED = 19;
+ public static final int REDIR_EXCEEDED = 19;
/** Not fetching. */
public static final int NOTFETCHING = 20;
/** Unchanged since the last fetch. */
public static final int NOTMODIFIED = 21;
+ /** Request was refused by protocol plugins, because it would block.
+ * The expected number of milliseconds to wait before retry may be provided
+ * in args. */
+ public static final int WOULDBLOCK = 22;
// Useful static instances for status codes that don't usually require any
// additional arguments.
@@ -72,6 +76,7 @@
public static final ProtocolStatus STATUS_REDIR_EXCEEDED = new ProtocolStatus(REDIR_EXCEEDED);
public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED);
+ public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(WOULDBLOCK);
private int code;
private long lastModified;
@@ -93,6 +98,7 @@
codeToName.put(new Integer(REDIR_EXCEEDED), "redir_exceeded");
codeToName.put(new Integer(NOTFETCHING), "notfetching");
codeToName.put(new Integer(NOTMODIFIED), "notmodified");
+ codeToName.put(new Integer(WOULDBLOCK), "wouldblock");
}
public ProtocolStatus() {
@@ -183,6 +189,7 @@
code == REDIR_EXCEEDED ||
code == RETRY ||
code == TEMP_MOVED ||
+ code == WOULDBLOCK ||
code == PROTO_NOT_FOUND;
}
Modified: lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=431366&r1=431365&r2=431366&view=diff
==============================================================================
--- lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/branches/branch-0.8/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Aug 14 08:08:14 2006
@@ -125,6 +125,9 @@
/** Do we use HTTP/1.1? */
protected boolean useHttp11 = false;
+
+ /** Skip page if Crawl-Delay longer than this value. */
+ protected long maxCrawlDelay = -1L;
/** Creates a new instance of HttpBase */
public HttpBase() {
@@ -152,6 +155,7 @@
this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf
.get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
this.serverDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
+ this.maxCrawlDelay = (long)(conf.getInt("fetcher.max.crawl.delay", -1) * 1000);
// backward-compatible default setting
this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
this.useHttp11 = conf.getBoolean("http.http11", false);
@@ -185,6 +189,14 @@
long crawlDelay = robots.getCrawlDelay(this, u);
long delay = crawlDelay > 0 ? crawlDelay : serverDelay;
+ if (maxCrawlDelay >= 0 && delay > maxCrawlDelay) {
+ // skip this page, otherwise the thread would block for too long.
+ LOGGER.info("Skipping: " + u + " exceeds fetcher.max.crawl.delay, max="
+ + (maxCrawlDelay / 1000) + ", Crawl-Delay=" + (delay / 1000));
+ Content c = new Content(u.toString(), u.toString(), EMPTY_CONTENT,
+ null, null, this.conf);
+ return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK);
+ }
String host = blockAddr(u, delay);
Response response;
try {