You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/01/04 12:38:48 UTC
svn commit: r492525 - in /lucene/nutch/trunk/src/java/org/apache/nutch:
crawl/Crawl.java fetcher/Fetcher.java
Author: ab
Date: Thu Jan 4 03:38:45 2007
New Revision: 492525
URL: http://svn.apache.org/viewvc?view=rev&rev=492525
Log:
Use a different status code when recording a redirected target URL without
fetching. Also fix an NPE in Crawl when Generator doesn't produce any
new segment. Reported by Meghna Kukreja.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=492525&r1=492524&r2=492525
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Jan 4 03:38:45 2007
@@ -117,6 +117,10 @@
for (int i = 0; i < depth; i++) { // generate new segment
Path segment = generator.generate(crawlDb, segments, -1, topN, System
.currentTimeMillis(), false, false);
+ if (segment == null) {
+ LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
+ break;
+ }
fetcher.fetch(segment, threads); // fetch it
if (!Fetcher.isParsing(job)) {
parseSegment.parse(segment); // parse it, if needed
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=492525&r1=492524&r2=492525
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jan 4 03:38:45 2007
@@ -157,6 +157,8 @@
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = this.urlFilters.filter(newUrl);
if (newUrl != null && !newUrl.equals(url.toString())) {
+ // record that we were redirected
+ output(url, datum, null, status, CrawlDatum.STATUS_FETCH_REDIR_PERM);
url = new Text(newUrl);
if (maxRedirect > 0) {
redirecting = true;
@@ -165,7 +167,7 @@
LOG.debug(" - content redirect to " + url + " (fetching now)");
}
} else {
- output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_FETCH_REDIR_TEMP);
+ output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
if (LOG.isDebugEnabled()) {
LOG.debug(" - content redirect to " + url + " (fetching later)");
}
@@ -198,7 +200,7 @@
LOG.debug(" - protocol redirect to " + url + " (fetching now)");
}
} else {
- output(url, new CrawlDatum(), null, null, code);
+ output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
if (LOG.isDebugEnabled()) {
LOG.debug(" - protocol redirect to " + url + " (fetching later)");
}