You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2013/06/19 23:26:07 UTC
svn commit: r1494776 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java

Author: snagel
Date: Wed Jun 19 21:26:07 2013
New Revision: 1494776

URL: http://svn.apache.org/r1494776
Log:
NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched in CrawlDb and is generated over and over again

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1494776&r1=1494775&r2=1494776&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jun 19 21:26:07 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1245 URL gone with 404 after db.fetch.interval.max stays db_unfetched in CrawlDb (snagel)
+
 * NUTCH-1527 Elasticsearch indexer (lufeng + markus)
 
 * NUTCH-1475 Index-More Plugin -- A better fall back value for date field (James Sullivan, snagel via lewismc)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1494776&r1=1494775&r2=1494776&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Wed Jun 19 21:26:07 2013
@@ -85,9 +85,8 @@ public abstract class AbstractFetchSched
   
   /**
    * This method specifies how to schedule refetching of pages
-   * marked as GONE. Default implementation increases fetchInterval by 50%,
-   * and if it exceeds the <code>maxInterval</code> it calls
-   * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
+   * marked as GONE. Default implementation increases fetchInterval by 50%; if
+   * that would reach <code>maxInterval</code>, it is set to 90% of it instead.
    *
    * @param url URL of the page.
    *
@@ -102,9 +101,11 @@ public abstract class AbstractFetchSched
           long prevFetchTime, long prevModifiedTime, long fetchTime) {
     // no page is truly GONE ... just increase the interval by 50%
     // and try much later.
-    datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
+    if ((datum.getFetchInterval() * 1.5f) < maxInterval)
+      datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
+    else
+      datum.setFetchInterval(maxInterval * 0.9f);
     datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
-    if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false);
     return datum;
   }