You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/04/11 21:18:54 UTC
svn commit: r393297 -
/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Author: ab
Date: Tue Apr 11 12:18:52 2006
New Revision: 393297
URL: http://svn.apache.org/viewcvs?rev=393297&view=rev
Log:
Re-instate support for content-level redirects.
Use the correct key when FetcherThread.output() is called
after redirects: it should use the new URL as the key,
not the original one.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=393297&r1=393296&r2=393297&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Apr 11 12:18:52 2006
@@ -80,6 +80,7 @@
private Configuration conf;
private URLFilters urlFilters;
private ParseUtil parseUtil;
+ private UrlNormalizer normalizer;
private ProtocolFactory protocolFactory;
public FetcherThread(Configuration conf) {
@@ -89,6 +90,7 @@
this.urlFilters = new URLFilters(conf);
this.parseUtil = new ParseUtil(conf);
this.protocolFactory = new ProtocolFactory(conf);
+ this.normalizer = new UrlNormalizerFactory(conf).getNormalizer();
}
public void run() {
@@ -117,7 +119,8 @@
}
// url may be changed through redirects.
- String url = key.toString();
+ UTF8 url = new UTF8();
+ url.set(key);
try {
LOG.info("fetching " + url); // fetch the page
@@ -126,30 +129,47 @@
do {
redirecting = false;
LOG.fine("redirectCount=" + redirectCount);
- Protocol protocol = this.protocolFactory.getProtocol(url);
- ProtocolOutput output = protocol.getProtocolOutput(new UTF8(url), datum);
+ Protocol protocol = this.protocolFactory.getProtocol(url.toString());
+ ProtocolOutput output = protocol.getProtocolOutput(url, datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
+ ParseStatus pstatus = null;
switch(status.getCode()) {
case ProtocolStatus.SUCCESS: // got a page
- output(key, datum, content, CrawlDatum.STATUS_FETCH_SUCCESS);
+ pstatus = output(url, datum, content, CrawlDatum.STATUS_FETCH_SUCCESS);
updateStatus(content.getContent().length);
+ if (pstatus != null && pstatus.isSuccess() &&
+ pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+ String newUrl = pstatus.getMessage();
+ newUrl = normalizer.normalize(newUrl);
+ newUrl = this.urlFilters.filter(newUrl);
+ if (newUrl != null && !newUrl.equals(url.toString())) {
+ url = new UTF8(newUrl);
+ redirecting = true;
+ redirectCount++;
+ LOG.fine(" - content redirect to " + url);
+ } else {
+ LOG.fine(" - content redirect skipped: " +
+ (url.equals(newUrl.toString()) ? "to same url" : "filtered"));
+ }
+ }
break;
case ProtocolStatus.MOVED: // redirect
case ProtocolStatus.TEMP_MOVED:
String newUrl = status.getMessage();
+ newUrl = normalizer.normalize(newUrl);
newUrl = this.urlFilters.filter(newUrl);
- if (newUrl != null && !newUrl.equals(url)) {
- url = newUrl;
+ if (newUrl != null && !newUrl.equals(url.toString())) {
+ url = new UTF8(newUrl);
redirecting = true;
redirectCount++;
LOG.fine(" - protocol redirect to " + url);
} else {
LOG.fine(" - protocol redirect skipped: " +
- (url.equals(newUrl) ? "to same url" : "filtered"));
+ (url.equals(newUrl.toString()) ? "to same url" : "filtered"));
}
break;
@@ -157,7 +177,7 @@
logError(url, status.getMessage());
case ProtocolStatus.RETRY: // retry
datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
- output(key, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
+ output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
break;
case ProtocolStatus.GONE: // gone
@@ -165,17 +185,17 @@
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
case ProtocolStatus.NOTMODIFIED:
- output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
break;
default:
LOG.warning("Unknown ProtocolStatus: " + status.getCode());
- output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
}
if (redirecting && redirectCount >= maxRedirect) {
LOG.info(" - redirect count exceeded " + url);
- output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
}
} while (redirecting && (redirectCount < maxRedirect));
@@ -183,7 +203,7 @@
} catch (Throwable t) { // unexpected exception
logError(url, t.toString());
- output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
}
}
@@ -196,14 +216,14 @@
}
}
- private void logError(String url, String message) {
+ private void logError(UTF8 url, String message) {
LOG.info("fetch of " + url + " failed with: " + message);
synchronized (Fetcher.this) { // record failure
errors++;
}
}
- private void output(UTF8 key, CrawlDatum datum,
+ private ParseStatus output(UTF8 key, CrawlDatum datum,
Content content, int status) {
datum.setStatus(status);
@@ -252,6 +272,8 @@
e.printStackTrace();
LOG.severe("fetcher caught:"+e.toString());
}
+ if (parse != null) return parse.getData().getStatus();
+ else return null;
}
}