You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/09/03 15:37:25 UTC
svn commit: r572335 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/parse/
src/test/org/apache/nutch/crawl/
Author: dogacan
Date: Mon Sep 3 06:37:24 2007
New Revision: 572335
URL: http://svn.apache.org/viewvc?rev=572335&view=rev
Log:
NUTCH-532 - CrawlDbMerger: wrong computation of last fetch time. Contributed by Emmanuel Joke.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Sep 3 06:37:24 2007
@@ -126,6 +126,9 @@
42. NUTCH-545 - Configuration and OnlineClusterer get initialized in every
request. (Dawid Weiss via dogacan)
+43. NUTCH-532 - CrawlDbMerger: wrong computation of last fetch time.
+ (Emmanuel Joke via dogacan)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Mon Sep 3 06:37:24 2007
@@ -33,8 +33,8 @@
public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule {
private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class);
- private float defaultInterval;
- private float maxInterval;
+ private int defaultInterval;
+ private int maxInterval;
public AbstractFetchSchedule() {
super(null);
@@ -48,9 +48,11 @@
super.setConf(conf);
if (conf == null) return;
int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0);
- defaultInterval = conf.getFloat("db.fetch.interval.default", 0);
+ defaultInterval = conf.getInt("db.fetch.interval.default", 0);
if (oldDefaultInterval > 0 && defaultInterval == 0) defaultInterval = oldDefaultInterval * SECONDS_PER_DAY;
- maxInterval = conf.getFloat("db.fetch.interval.max", 30.0f * SECONDS_PER_DAY);
+ int oldMaxInterval = conf.getInt("db.max.fetch.interval", 0);
+ maxInterval = conf.getInt("db.fetch.interval.max", 0 );
+ if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
LOG.info("defaultInterval=" + defaultInterval);
LOG.info("maxInterval=" + maxInterval);
}
@@ -91,7 +93,7 @@
// no page is truly GONE ... just increase the interval by 50%
// and try much later.
datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
- datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0d));
+ datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false);
return datum;
}
@@ -117,6 +119,14 @@
}
/**
+ * This method returns the last fetch time of the CrawlDatum.
+ * @return the date as a long.
+ */
+ public long calculateLastFetchTime(CrawlDatum datum){
+ return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000;
+ }
+
+ /**
* This method provides information whether the page is suitable for
* selection in the current fetchlist. NOTE: a true return value does not
* guarantee that the page will be fetched, it just allows it to be
@@ -136,7 +146,7 @@
// pages are never truly GONE - we have to check them from time to time.
// pages with too long fetchInterval are adjusted so that they fit within
// maximum fetchInterval (segment retention period).
- if (datum.getFetchTime() - curTime > maxInterval * 1000) {
+ if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
datum.setFetchInterval(maxInterval * 0.9f);
datum.setFetchTime(curTime);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Mon Sep 3 06:37:24 2007
@@ -57,9 +57,9 @@
private float DEC_RATE;
- private float MAX_INTERVAL;
+ private int MAX_INTERVAL;
- private float MIN_INTERVAL;
+ private int MIN_INTERVAL;
private boolean SYNC_DELTA;
@@ -70,8 +70,8 @@
if (conf == null) return;
INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
- MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", 60.0f);
- MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval", (float) (3600 * 24 * 365)); // 1 year
+ MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60);
+ MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval", SECONDS_PER_DAY * 365 ); // 1 year
SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
}
@@ -101,7 +101,7 @@
}
if (interval < MIN_INTERVAL) interval = MIN_INTERVAL;
if (interval > MAX_INTERVAL) interval = MAX_INTERVAL;
- datum.setFetchTime(refTime + Math.round(1000.0f * datum.getFetchInterval()));
+ datum.setFetchTime(refTime + (long)datum.getFetchInterval() * 1000 );
datum.setModifiedTime(modifiedTime);
return datum;
}
@@ -134,14 +134,14 @@
lastModified = curTime;
}
System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / (float) (3600 * 24)) + " days" + "\t missed " + miss);
+ + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
if (p.getFetchTime() <= curTime) {
fetchCnt++;
fs.setFetchSchedule(new Text("http://www.example.com"), p,
p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / (float) (3600 * 24)) + " days");
+ + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
if (!changed) miss++;
if (miss > maxMiss) maxMiss = miss;
changed = false;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Mon Sep 3 06:37:24 2007
@@ -29,7 +29,7 @@
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";
- private final static byte CUR_VERSION = 5;
+ private final static byte CUR_VERSION = 6;
/** Compatibility values for on-the-fly conversion from versions < 5. */
private static final byte OLD_STATUS_SIGNATURE = 0;
@@ -114,7 +114,7 @@
private byte status;
private long fetchTime = System.currentTimeMillis();
private byte retries;
- private float fetchInterval;
+ private int fetchInterval;
private float score = 1.0f;
private byte[] signature = null;
private long modifiedTime;
@@ -134,12 +134,12 @@
metaData = new MapWritable();
}
- public CrawlDatum(int status, float fetchInterval) {
+ public CrawlDatum(int status, int fetchInterval) {
this.status = (byte)status;
this.fetchInterval = fetchInterval;
}
- public CrawlDatum(int status, float fetchInterval, float score) {
+ public CrawlDatum(int status, int fetchInterval, float score) {
this(status, fetchInterval);
this.score = score;
}
@@ -172,10 +172,13 @@
public byte getRetriesSinceFetch() { return retries; }
public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;}
- public float getFetchInterval() { return fetchInterval; }
- public void setFetchInterval(float fetchInterval) {
+ public int getFetchInterval() { return fetchInterval; }
+ public void setFetchInterval(int fetchInterval) {
this.fetchInterval = fetchInterval;
}
+ public void setFetchInterval(float fetchInterval) {
+ this.fetchInterval = Math.round(fetchInterval);
+ }
public float getScore() { return score; }
public void setScore(float score) { this.score = score; }
@@ -221,7 +224,9 @@
status = in.readByte();
fetchTime = in.readLong();
retries = in.readByte();
- fetchInterval = in.readFloat();
+ if (version > 5) {
+ fetchInterval = in.readInt();
+ } else fetchInterval = Math.round(in.readFloat());
score = in.readFloat();
if (version > 2) {
modifiedTime = in.readLong();
@@ -256,7 +261,7 @@
out.writeByte(status);
out.writeLong(fetchTime);
out.writeByte(retries);
- out.writeFloat(fetchInterval);
+ out.writeInt(fetchInterval);
out.writeFloat(score);
out.writeLong(modifiedTime);
if (signature == null) {
@@ -330,8 +335,8 @@
int retries2 = b2[s2+1+1+8];
if (retries2 != retries1)
return retries2 - retries1;
- float fetchInterval1 = readFloat(b1, s1+1+1+8+1);
- float fetchInterval2 = readFloat(b2, s2+1+1+8+1);
+ int fetchInterval1 = readInt(b1, s1+1+1+8+1);
+ int fetchInterval2 = readInt(b2, s2+1+1+8+1);
if (fetchInterval2 != fetchInterval1)
return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
@@ -409,7 +414,7 @@
((int)fetchTime) ^
((int)modifiedTime) ^
retries ^
- Float.floatToIntBits(fetchInterval) ^
+ fetchInterval ^
Float.floatToIntBits(score);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Mon Sep 3 06:37:24 2007
@@ -55,10 +55,12 @@
public static class Merger extends MapReduceBase implements Reducer {
MapWritable meta = new MapWritable();
+ private FetchSchedule schedule;
public void close() throws IOException {}
public void configure(JobConf conf) {
+ schedule = FetchScheduleFactory.getFetchSchedule(conf);
}
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
@@ -70,17 +72,17 @@
CrawlDatum val = (CrawlDatum) values.next();
if (res == null) {
res = val;
- resTime = res.getFetchTime() - Math.round(res.getFetchInterval() * 3600 * 24 * 1000);
+ resTime = schedule.calculateLastFetchTime(res);
meta.putAll(res.getMetaData());
continue;
}
// compute last fetch time, and pick the latest
- long valTime = val.getFetchTime() - Math.round(val.getFetchInterval() * 3600 * 24 * 1000);
+ long valTime = schedule.calculateLastFetchTime(val);
if (valTime > resTime) {
// collect all metadata, newer values override older values
meta.putAll(val.getMetaData());
res = val;
- resTime = res.getFetchTime() - Math.round(res.getFetchInterval() * 3600 * 24 * 1000);
+ resTime = valTime ;
} else {
// insert older metadata before newer
val.getMetaData().putAll(meta);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Sep 3 06:37:24 2007
@@ -40,14 +40,16 @@
private ArrayList<CrawlDatum> linked = new ArrayList<CrawlDatum>();
private ScoringFilters scfilters = null;
private boolean additionsAllowed;
- private float maxInterval;
+ private int maxInterval;
private FetchSchedule schedule;
public void configure(JobConf job) {
retryMax = job.getInt("db.fetch.retry.max", 3);
scfilters = new ScoringFilters(job);
additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
- maxInterval = (float)(job.getInt("db.max.fetch.interval", 30) * 3600 * 24);
+ int oldMaxInterval = job.getInt("db.max.fetch.interval", 0);
+ maxInterval = job.getInt("db.fetch.interval.max", 0 );
+ if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
schedule = FetchScheduleFactory.getFetchSchedule(job);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java Mon Sep 3 06:37:24 2007
@@ -32,7 +32,7 @@
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
- datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0d));
+ datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
datum.setModifiedTime(modifiedTime);
return datum;
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Mon Sep 3 06:37:24 2007
@@ -35,7 +35,7 @@
/** Page is known to remain unmodified since our last visit. */
public static final int STATUS_NOTMODIFIED = 2;
- public static final float SECONDS_PER_DAY = 3600.0f * 24.0f;
+ public static final int SECONDS_PER_DAY = 3600 * 24;
/**
* Initialize fetch schedule related data. Implementations should at least
* set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
@@ -111,6 +111,12 @@
public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime, long fetchTime);
+ /**
+ * Calculates last fetch time of the given CrawlDatum.
+ * @return the date as a long.
+ */
+ public long calculateLastFetchTime(CrawlDatum datum);
+
/**
* This method provides information whether the page is suitable for
* selection in the current fetchlist. NOTE: a true return value does not
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Mon Sep 3 06:37:24 2007
@@ -46,7 +46,7 @@
/** Normalize and filter injected urls. */
public static class InjectMapper implements Mapper {
private URLNormalizers urlNormalizers;
- private float interval;
+ private int interval;
private float scoreInjected;
private JobConf jobConf;
private URLFilters filters;
@@ -57,7 +57,7 @@
public void configure(JobConf job) {
this.jobConf = job;
urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
- interval = jobConf.getFloat("db.fetch.interval.default", 2592000.0f);
+ interval = jobConf.getInt("db.fetch.interval.default", 2592000);
filters = new URLFilters(jobConf);
scfilters = new ScoringFilters(jobConf);
scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Mon Sep 3 06:37:24 2007
@@ -82,7 +82,7 @@
this.filters = new URLFilters(job);
this.scfilters = new ScoringFilters(job);
final UrlValidator validator = UrlValidator.get();
- final float interval = job.getFloat("db.fetch.interval.default", 2592000.0f);
+ final int interval = job.getInt("db.fetch.interval.default", 2592000);
final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
final int maxOutlinks = job.getInt("db.max.outlinks.per.page", 100);
final CompressionType compType = SequenceFile.getCompressionType(job);
@@ -125,7 +125,7 @@
byte[] signature = StringUtil.fromHexString(sig);
if (signature != null) {
// append a CrawlDatum with a signature
- CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f);
+ CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
d.setSignature(signature);
crawlOut.append(key, d);
}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Mon Sep 3 06:37:24 2007
@@ -341,7 +341,7 @@
* @return Constructed object
*/
private URLCrawlDatum createURLCrawlDatum(final String url,
- final float fetchInterval, final float score) {
+ final int fetchInterval, final float score) {
return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java Mon Sep 3 06:37:24 2007
@@ -84,7 +84,7 @@
assertEquals(100, datum2.size());
testWritable(datum2);
- CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1f);
+ CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1);
c.setMetaData(new MapWritable());
for (int i = 0; i < 100; i++) {
c.getMetaData().put(new LongWritable(i), new Text("" + 1));