You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/01/04 12:38:48 UTC

svn commit: r492525 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/Crawl.java fetcher/Fetcher.java

Author: ab
Date: Thu Jan  4 03:38:45 2007
New Revision: 492525

URL: http://svn.apache.org/viewvc?view=rev&rev=492525
Log:
Use a different status code when recording a redirected target URL without
fetching it. Also fix an NPE in Crawl when Generator doesn't produce any
new segment. Reported by Meghna Kukreja.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?view=diff&rev=492525&r1=492524&r2=492525
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Jan  4 03:38:45 2007
@@ -117,6 +117,10 @@
     for (int i = 0; i < depth; i++) {             // generate new segment
       Path segment = generator.generate(crawlDb, segments, -1, topN, System
           .currentTimeMillis(), false, false);
+      if (segment == null) {
+        LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
+        break;
+      }
       fetcher.fetch(segment, threads);  // fetch it
       if (!Fetcher.isParsing(job)) {
         parseSegment.parse(segment);    // parse it, if needed

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=492525&r1=492524&r2=492525
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jan  4 03:38:45 2007
@@ -157,6 +157,8 @@
                   newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                   newUrl = this.urlFilters.filter(newUrl);
                   if (newUrl != null && !newUrl.equals(url.toString())) {
+                    // record that we were redirected
+                    output(url, datum, null, status, CrawlDatum.STATUS_FETCH_REDIR_PERM);
                     url = new Text(newUrl);
                     if (maxRedirect > 0) {
                       redirecting = true;
@@ -165,7 +167,7 @@
                         LOG.debug(" - content redirect to " + url + " (fetching now)");
                       }
                     } else {
-                      output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_FETCH_REDIR_TEMP);
+                      output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
                       if (LOG.isDebugEnabled()) {
                         LOG.debug(" - content redirect to " + url + " (fetching later)");
                       }
@@ -198,7 +200,7 @@
                       LOG.debug(" - protocol redirect to " + url + " (fetching now)");
                     }
                   } else {
-                    output(url, new CrawlDatum(), null, null, code);
+                    output(url, new CrawlDatum(), null, null, CrawlDatum.STATUS_LINKED);
                     if (LOG.isDebugEnabled()) {
                       LOG.debug(" - protocol redirect to " + url + " (fetching later)");
                     }