You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/04/03 15:35:35 UTC
svn commit: r391044 -
/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: ab
Date: Mon Apr 3 06:35:34 2006
New Revision: 391044
URL: http://svn.apache.org/viewcvs?rev=391044&view=rev
Log:
Make sure we use new values for score, metadata, fetch interval
and fetch time.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=391044&r1=391043&r2=391044&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Apr 3 06:35:34 2006
@@ -25,6 +25,7 @@
/** Merge new page entries with existing entries. */
public class CrawlDbReducer implements Reducer {
private int retryMax;
+ private CrawlDatum result = new CrawlDatum();
public void configure(JobConf job) {
retryMax = job.getInt("db.fetch.retry.max", 3);
@@ -61,36 +62,45 @@
}
}
- CrawlDatum result = null;
+ // initialize with the latest version
+ result.set(highest);
+ if (old != null) {
+ // copy metadata from old, if exists
+ if (old.getMetaData() != null) {
+ result.getMetaData().putAll(old.getMetaData());
+ // overlay with new, if any
+ if (highest.getMetaData() != null)
+ result.getMetaData().putAll(highest.getMetaData());
+ }
+ // set the most recent valid value of modifiedTime
+ if (old.getModifiedTime() > 0 && highest.getModifiedTime() == 0) {
+ result.setModifiedTime(old.getModifiedTime());
+ }
+ }
switch (highest.getStatus()) { // determine new status
case CrawlDatum.STATUS_DB_UNFETCHED: // no new entry
case CrawlDatum.STATUS_DB_FETCHED:
case CrawlDatum.STATUS_DB_GONE:
- result = old; // use old
+ result.set(old); // use old
break;
case CrawlDatum.STATUS_LINKED: // highest was link
if (old != null) { // if old exists
- result = old; // use it
+ result.set(old); // use it
} else {
- result = highest; // use new entry
result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
- result.setScore(1.0f); // initial score is 1.0f
}
- result.setSignature(null); // reset the signature
break;
case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
- result = highest; // use new entry
- if (highest.getSignature() == null) highest.setSignature(signature);
+ if (highest.getSignature() == null) result.setSignature(signature);
result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
result.setNextFetchTime();
break;
case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
- result = highest; // use new entry
if (old != null)
result.setSignature(old.getSignature()); // use old signature
if (highest.getRetriesSinceFetch() < retryMax) {
@@ -101,7 +111,6 @@
break;
case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
- result = highest; // use new entry
if (old != null)
result.setSignature(old.getSignature()); // use old signature
result.setStatus(CrawlDatum.STATUS_DB_GONE);
@@ -111,10 +120,8 @@
throw new RuntimeException("Unknown status: "+highest.getStatus());
}
- if (result != null) {
- result.setScore(result.getScore() + scoreIncrement);
- output.collect(key, result);
- }
+ result.setScore(result.getScore() + scoreIncrement);
+ output.collect(key, result);
}
}