You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2008/03/17 13:42:56 UTC
svn commit: r637861 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
Author: ab
Date: Mon Mar 17 05:42:54 2008
New Revision: 637861
URL: http://svn.apache.org/viewvc?rev=637861&view=rev
Log:
NUTCH-616 Reset Fetch Retry counter when fetch is successful.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 17 05:42:54 2008
@@ -232,6 +232,9 @@
84. NUTCH-615 - Redirected URL-s fetched without setting fetchInterval.
Guard against reprUrl being null. (Emmanuel Joke, ab)
+85. NUTCH-616 - Reset Fetch Retry counter when fetch is successful (Emmanuel
+ Joke, ab)
+
Release 0.9 - 2007-04-02
1. Changed log4j configuration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Mon Mar 17 05:42:54 2008
@@ -33,8 +33,8 @@
public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule {
private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class);
- private int defaultInterval;
- private int maxInterval;
+ protected int defaultInterval;
+ protected int maxInterval;
public AbstractFetchSchedule() {
super(null);
@@ -69,12 +69,22 @@
public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
datum.setFetchTime(System.currentTimeMillis());
datum.setFetchInterval(defaultInterval);
+ datum.setRetriesSinceFetch(0);
return datum;
}
- public abstract CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ /**
+ * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
+ * successfully fetched page. NOTE: this implementation resets the
+ * retry counter - extending classes should call super.setFetchSchedule() to
+ * preserve this behavior.
+ */
+ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state);
+ long fetchTime, long modifiedTime, int state) {
+ datum.setRetriesSinceFetch(0);
+ return datum;
+ }
/**
* This method specifies how to schedule refetching of pages
@@ -101,7 +111,8 @@
/**
* This method adjusts the fetch schedule if fetching needs to be
* re-tried due to transient errors. The default implementation
- * sets the next fetch time 1 day in the future.
+ * sets the next fetch time 1 day in the future and increases
+ * the retry counter.
* @param url URL of the page
* @param datum page information
* @param prevFetchTime previous fetch time
@@ -115,6 +126,7 @@
public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime, long fetchTime) {
datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY);
+ datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
return datum;
}
@@ -122,7 +134,7 @@
* This method returns the last fetch time of the CrawlDatum
* @return the date as a long.
*/
- public long calculateLastFetchTime(CrawlDatum datum){
+ public long calculateLastFetchTime(CrawlDatum datum) {
return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000;
}
@@ -157,8 +169,8 @@
}
/**
- * This method resets fetchTime, fetchInterval, modifiedTime and
- * page signature, so that it forces refetching.
+ * This method resets fetchTime, fetchInterval, modifiedTime,
+ * retriesSinceFetch and page signature, so that it forces refetching.
* @param url URL of the page
* @param datum datum instance
* @param asap if true, force refetch as soon as possible - this sets
@@ -170,6 +182,7 @@
if (datum.getFetchInterval() > maxInterval)
datum.setFetchInterval(maxInterval * 0.9f);
datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ datum.setRetriesSinceFetch(0);
datum.setSignature(null);
datum.setModifiedTime(0L);
if (asap) datum.setFetchTime(System.currentTimeMillis());
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Mon Mar 17 05:42:54 2008
@@ -76,9 +76,12 @@
SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
}
+ @Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
+ super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+ fetchTime, modifiedTime, state);
long refTime = fetchTime;
if (modifiedTime <= 0) modifiedTime = fetchTime;
float interval = datum.getFetchInterval();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Mar 17 05:42:54 2008
@@ -210,15 +210,16 @@
}
return;
case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
- if (old != null)
+ if (old != null) {
result.setSignature(old.getSignature()); // use old signature
- if (fetch.getRetriesSinceFetch() < retryMax) {
+ }
+ result = schedule.setPageRetrySchedule((Text)key, result, prevFetchTime,
+ prevModifiedTime, fetch.getFetchTime());
+ if (result.getRetriesSinceFetch() < retryMax) {
result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
} else {
result.setStatus(CrawlDatum.STATUS_DB_GONE);
}
- result = schedule.setPageRetrySchedule((Text)key, result, prevFetchTime,
- prevModifiedTime, fetch.getFetchTime());
break;
case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java Mon Mar 17 05:42:54 2008
@@ -29,9 +29,12 @@
*/
public class DefaultFetchSchedule extends AbstractFetchSchedule {
+ @Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
+ datum = super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+ fetchTime, modifiedTime, state);
datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
datum.setModifiedTime(modifiedTime);
return datum;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Mon Mar 17 05:42:54 2008
@@ -52,7 +52,8 @@
public CrawlDatum initializeSchedule(Text url, CrawlDatum datum);
/**
- * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a page.
+ * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
+ * successfully fetched page.
* Implementations may use supplied arguments to support different re-fetching
* schedules.
*
@@ -97,7 +98,8 @@
/**
* This method adjusts the fetch schedule if fetching needs to be
* re-tried due to transient errors. The default implementation
- * sets the next fetch time 1 day in the future.
+ * sets the next fetch time 1 day in the future and increases the
+ * retry counter.
* @param url URL of the page
* @param datum page information
* @param prevFetchTime previous fetch time
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar 17 05:42:54 2008
@@ -209,9 +209,6 @@
logError(url, status.getMessage());
/* FALLTHROUGH */
case ProtocolStatus.RETRY: // retry
- datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
- /* FALLTHROUGH */
- // intermittent blocking - retry without increasing the counter
case ProtocolStatus.WOULDBLOCK:
case ProtocolStatus.BLOCKED:
output(url, datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=637861&r1=637860&r2=637861&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Mon Mar 17 05:42:54 2008
@@ -611,9 +611,6 @@
logError(fit.url, status.getMessage());
/* FALLTHROUGH */
case ProtocolStatus.RETRY: // retry
- fit.datum.setRetriesSinceFetch(fit.datum.getRetriesSinceFetch()+1);
- /* FALLTHROUGH */
- // intermittent blocking - retry without increasing the counter
case ProtocolStatus.BLOCKED:
output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
break;