You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2008/03/17 13:42:56 UTC

svn commit: r637861 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/

Author: ab
Date: Mon Mar 17 05:42:54 2008
New Revision: 637861

URL: http://svn.apache.org/viewvc?rev=637861&view=rev
Log:
NUTCH-616 Reset Fetch Retry counter when fetch is successful.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 17 05:42:54 2008
@@ -232,6 +232,9 @@
 84. NUTCH-615 - Redirected URL-s fetched without setting fetchInterval.
     Guard against reprUrl being null. (Emmanuel Joke, ab)
 
+85. NUTCH-616 - Reset Fetch Retry counter when fetch is successful (Emmanuel
+    Joke, ab)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Mon Mar 17 05:42:54 2008
@@ -33,8 +33,8 @@
 public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule {
   private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class);
   
-  private int defaultInterval;
-  private int maxInterval;
+  protected int defaultInterval;
+  protected int maxInterval;
   
   public AbstractFetchSchedule() {
     super(null);
@@ -69,12 +69,22 @@
   public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
     datum.setFetchTime(System.currentTimeMillis());
     datum.setFetchInterval(defaultInterval);
+    datum.setRetriesSinceFetch(0);
     return datum;
   }
   
-  public abstract CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+  /**
+   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
+   * successfully fetched page. NOTE: this implementation resets the
+   * retry counter - extending classes should call super.setFetchSchedule() to
+   * preserve this behavior.
+   */
+  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
           long prevFetchTime, long prevModifiedTime,
-          long fetchTime, long modifiedTime, int state);
+          long fetchTime, long modifiedTime, int state) {
+    datum.setRetriesSinceFetch(0);
+    return datum;
+  }
   
   /**
    * This method specifies how to schedule refetching of pages
@@ -101,7 +111,8 @@
   /**
    * This method adjusts the fetch schedule if fetching needs to be
    * re-tried due to transient errors. The default implementation
-   * sets the next fetch time 1 day in the future.
+   * sets the next fetch time 1 day in the future and increases
+   * the retry counter.
    * @param url URL of the page
    * @param datum page information
    * @param prevFetchTime previous fetch time
@@ -115,6 +126,7 @@
   public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
           long prevFetchTime, long prevModifiedTime, long fetchTime) {
     datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY);
+    datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
     return datum;
   }
   
@@ -122,7 +134,7 @@
    * This method return the last fetch time of the CrawlDatum
    * @return the date as a long.
    */
-  public long calculateLastFetchTime(CrawlDatum datum){
+  public long calculateLastFetchTime(CrawlDatum datum) {
     return  datum.getFetchTime() - (long)datum.getFetchInterval() * 1000;
   }
 
@@ -157,8 +169,8 @@
   }
   
   /**
-   * This method resets fetchTime, fetchInterval, modifiedTime and
-   * page signature, so that it forces refetching.
+   * This method resets fetchTime, fetchInterval, modifiedTime,
+   * retriesSinceFetch and page signature, so that it forces refetching.
    * @param url URL of the page
    * @param datum datum instance
    * @param asap if true, force refetch as soon as possible - this sets
@@ -170,6 +182,7 @@
     if (datum.getFetchInterval() > maxInterval)
       datum.setFetchInterval(maxInterval * 0.9f);
     datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+    datum.setRetriesSinceFetch(0);
     datum.setSignature(null);
     datum.setModifiedTime(0L);
     if (asap) datum.setFetchTime(System.currentTimeMillis());

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Mon Mar 17 05:42:54 2008
@@ -76,9 +76,12 @@
     SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
   }
 
+  @Override
   public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
           long prevFetchTime, long prevModifiedTime,
           long fetchTime, long modifiedTime, int state) {
+    super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+        fetchTime, modifiedTime, state);
     long refTime = fetchTime;
     if (modifiedTime <= 0) modifiedTime = fetchTime;
     float interval = datum.getFetchInterval();

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Mar 17 05:42:54 2008
@@ -210,15 +210,16 @@
       }   
       return;
     case CrawlDatum.STATUS_FETCH_RETRY:           // temporary failure
-      if (old != null)
+      if (old != null) {
         result.setSignature(old.getSignature());  // use old signature
-      if (fetch.getRetriesSinceFetch() < retryMax) {
+      }
+      result = schedule.setPageRetrySchedule((Text)key, result, prevFetchTime,
+          prevModifiedTime, fetch.getFetchTime());
+      if (result.getRetriesSinceFetch() < retryMax) {
         result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
       } else {
         result.setStatus(CrawlDatum.STATUS_DB_GONE);
       }
-      result = schedule.setPageRetrySchedule((Text)key, result, prevFetchTime,
-          prevModifiedTime, fetch.getFetchTime());
       break;
 
     case CrawlDatum.STATUS_FETCH_GONE:            // permanent failure

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java Mon Mar 17 05:42:54 2008
@@ -29,9 +29,12 @@
  */
 public class DefaultFetchSchedule extends AbstractFetchSchedule {
 
+  @Override
   public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
           long prevFetchTime, long prevModifiedTime,
           long fetchTime, long modifiedTime, int state) {
+    datum = super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+        fetchTime, modifiedTime, state);
     datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
     datum.setModifiedTime(modifiedTime);
     return datum;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Mon Mar 17 05:42:54 2008
@@ -52,7 +52,8 @@
   public CrawlDatum initializeSchedule(Text url, CrawlDatum datum);
   
   /**
-   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a page.
+   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
+   * successfully fetched page.
    * Implementations may use supplied arguments to support different re-fetching
    * schedules.
    * 
@@ -97,7 +98,8 @@
   /**
    * This method adjusts the fetch schedule if fetching needs to be
    * re-tried due to transient errors. The default implementation
-   * sets the next fetch time 1 day in the future.
+   * sets the next fetch time 1 day in the future and increases the
+   * retry counter.
    * @param url URL of the page
    * @param datum page information
    * @param prevFetchTime previous fetch time

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar 17 05:42:54 2008
@@ -209,9 +209,6 @@
                 logError(url, status.getMessage());
               /* FALLTHROUGH */
               case ProtocolStatus.RETRY:          // retry
-                datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
-              /* FALLTHROUGH */
-              // intermittent blocking - retry without increasing the counter
               case ProtocolStatus.WOULDBLOCK:
               case ProtocolStatus.BLOCKED:
                 output(url, datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Mon Mar 17 05:42:54 2008
@@ -611,9 +611,6 @@
                 logError(fit.url, status.getMessage());
                 /* FALLTHROUGH */
               case ProtocolStatus.RETRY:          // retry
-                fit.datum.setRetriesSinceFetch(fit.datum.getRetriesSinceFetch()+1);
-                /* FALLTHROUGH */
-                // intermittent blocking - retry without increasing the counter
               case ProtocolStatus.BLOCKED:
                 output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                 break;