You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/09/23 21:36:48 UTC
svn commit: r449293 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/protocol/
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
Author: ab
Date: Sat Sep 23 12:36:47 2006
New Revision: 449293
URL: http://svn.apache.org/viewvc?view=rev&rev=449293
Log:
NUTCH-350: URLs were incorrectly marked as STATUS_FETCH_GONE when blocked by
http.max.delays. The status is now set to STATUS_FETCH_RETRY instead. Since this
is an intermittent problem related to the Fetcher implementation, we don't
increase the retry counter.
Added:
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (with props)
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=449293&r1=449292&r2=449293
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Sep 23 12:36:47 2006
@@ -32,6 +32,18 @@
11. NUTCH-332 - Fix the problem of doubling scores caused by links pointing
to the current page (e.g. anchors). (Stefan Groschupf via ab)
+12. NUTCH-365 - Flexible URL normalization (ab)
+
+13. NUTCH-336 - Differentiate between newly discovered pages and newly
+ injected pages (Chris Schneider via ab) NOTE: this changes the
+ scoring API, filter implementations need to be updated.
+
+14. NUTCH-337 - Fetcher ignores the fetcher.parse value (Stefan Groschupf
+ via ab)
+
+15. NUTCH-350 - Urls blocked by http.max.delays incorrectly marked as GONE
+ (Stefan Groschupf via ab)
+
Release 0.8 - 2006-07-25
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=449293&r1=449292&r2=449293
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Sep 23 12:36:47 2006
@@ -189,18 +189,24 @@
}
break;
+ // failures - increase the retry counter
case ProtocolStatus.EXCEPTION:
logError(url, status.getMessage());
+ /* FALLTHROUGH */
case ProtocolStatus.RETRY: // retry
datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
+ /* FALLTHROUGH */
+ // intermittent blocking - retry without increasing the counter
+ case ProtocolStatus.WOULDBLOCK:
+ case ProtocolStatus.BLOCKED:
output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
break;
+ // permanent failures
case ProtocolStatus.GONE: // gone
case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
- case ProtocolStatus.WOULDBLOCK:
case ProtocolStatus.NOTMODIFIED:
output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
break;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?view=diff&rev=449293&r1=449292&r2=449293
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Sat Sep 23 12:36:47 2006
@@ -64,7 +64,9 @@
* The expected number of milliseconds to wait before retry may be provided
* in args. */
public static final int WOULDBLOCK = 22;
-
+ /** Thread was blocked http.max.delays times during fetching. */
+ public static final int BLOCKED = 23;
+
// Useful static instances for status codes that don't usually require any
// additional arguments.
public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS);
@@ -77,6 +79,7 @@
public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED);
public static final ProtocolStatus STATUS_WOULDBLOCK = new ProtocolStatus(WOULDBLOCK);
+ public static final ProtocolStatus STATUS_BLOCKED = new ProtocolStatus(BLOCKED);
private int code;
private long lastModified;
@@ -99,6 +102,7 @@
codeToName.put(new Integer(NOTFETCHING), "notfetching");
codeToName.put(new Integer(NOTMODIFIED), "notmodified");
codeToName.put(new Integer(WOULDBLOCK), "wouldblock");
+ codeToName.put(new Integer(BLOCKED), "blocked");
}
public ProtocolStatus() {
Added: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java?view=auto&rev=449293
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (added)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java Sat Sep 23 12:36:47 2006
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http.api;
+
+public class BlockedException extends HttpException {
+
+ public BlockedException(String msg) {
+ super(msg);
+ }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?view=diff&rev=449293&r1=449292&r2=449293
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Sat Sep 23 12:36:47 2006
@@ -197,7 +197,12 @@
null, null, this.conf);
return new ProtocolOutput(c, ProtocolStatus.STATUS_WOULDBLOCK);
}
- String host = blockAddr(u, delay);
+ String host;
+ try {
+ host = blockAddr(u, delay);
+ } catch (BlockedException be) {
+ return new ProtocolOutput(null, ProtocolStatus.STATUS_BLOCKED);
+ }
Response response;
try {
response = getResponse(u, datum, false); // make a request
@@ -354,7 +359,7 @@
}
if (delays == maxDelays)
- throw new HttpException("Exceeded http.max.delays: retry later.");
+ throw new BlockedException("Exceeded http.max.delays: retry later.");
long done = time.longValue();
long now = System.currentTimeMillis();