You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/09/03 15:37:25 UTC

svn commit: r572335 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/parse/ src/test/org/apache/nutch/crawl/

Author: dogacan
Date: Mon Sep  3 06:37:24 2007
New Revision: 572335

URL: http://svn.apache.org/viewvc?rev=572335&view=rev
Log:
NUTCH-532 - CrawlDbMerger: wrong computation of last fetch time. Contributed by Emmanuel Joke.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Sep  3 06:37:24 2007
@@ -126,6 +126,9 @@
 42. NUTCH-545 - Configuration and OnlineClusterer get initialized in every
     request. (Dawid Weiss via dogacan)
 
+43. NUTCH-532 - CrawlDbMerger: wrong computation of last fetch time. 
+    (Emmanuel Joke via dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Mon Sep  3 06:37:24 2007
@@ -33,8 +33,8 @@
 public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule {
   private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class);
   
-  private float defaultInterval;
-  private float maxInterval;
+  private int defaultInterval;
+  private int maxInterval;
   
   public AbstractFetchSchedule() {
     super(null);
@@ -48,9 +48,11 @@
     super.setConf(conf);
     if (conf == null) return;
     int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0);
-    defaultInterval = conf.getFloat("db.fetch.interval.default", 0);
+    defaultInterval = conf.getInt("db.fetch.interval.default", 0);
     if (oldDefaultInterval > 0 && defaultInterval == 0) defaultInterval = oldDefaultInterval * SECONDS_PER_DAY;
-    maxInterval = conf.getFloat("db.fetch.interval.max", 30.0f * SECONDS_PER_DAY);
+    int oldMaxInterval = conf.getInt("db.max.fetch.interval", 0);
+    maxInterval = conf.getInt("db.fetch.interval.max", 0 );
+    if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
     LOG.info("defaultInterval=" + defaultInterval);
     LOG.info("maxInterval=" + maxInterval);
   }
@@ -91,7 +93,7 @@
     // no page is truly GONE ... just increase the interval by 50%
     // and try much later.
     datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
-    datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0d));
+    datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
     if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false);
     return datum;
   }
@@ -117,6 +119,14 @@
   }
   
   /**
+   * This method returns the last fetch time of the CrawlDatum.
+   * @return the date as a long.
+   */
+  public long calculateLastFetchTime(CrawlDatum datum){
+    return  datum.getFetchTime() - (long)datum.getFetchInterval() * 1000;
+  }
+
+  /**
    * This method provides information whether the page is suitable for
    * selection in the current fetchlist. NOTE: a true return value does not
    * guarantee that the page will be fetched, it just allows it to be
@@ -136,7 +146,7 @@
     // pages are never truly GONE - we have to check them from time to time.
     // pages with too long fetchInterval are adjusted so that they fit within
     // maximum fetchInterval (segment retention period).
-    if (datum.getFetchTime() - curTime > maxInterval * 1000) {
+    if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
       datum.setFetchInterval(maxInterval * 0.9f);
       datum.setFetchTime(curTime);
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Mon Sep  3 06:37:24 2007
@@ -57,9 +57,9 @@
 
   private float DEC_RATE;
 
-  private float MAX_INTERVAL;
+  private int MAX_INTERVAL;
 
-  private float MIN_INTERVAL;
+  private int MIN_INTERVAL;
   
   private boolean SYNC_DELTA;
 
@@ -70,8 +70,8 @@
     if (conf == null) return;
     INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
     DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
-    MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", 60.0f);
-    MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval", (float) (3600 * 24 * 365)); // 1 year
+    MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60);
+    MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval", SECONDS_PER_DAY * 365 ); // 1 year
     SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
     SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
   }
@@ -101,7 +101,7 @@
     }
     if (interval < MIN_INTERVAL) interval = MIN_INTERVAL;
     if (interval > MAX_INTERVAL) interval = MAX_INTERVAL;
-    datum.setFetchTime(refTime + Math.round(1000.0f * datum.getFetchInterval()));
+    datum.setFetchTime(refTime + (long)datum.getFetchInterval() * 1000 );
     datum.setModifiedTime(modifiedTime);
     return datum;
   }
@@ -134,14 +134,14 @@
         lastModified = curTime;
       }
       System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
-              + (p.getFetchInterval() / (float) (3600 * 24)) + " days" + "\t missed " + miss);
+              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
       if (p.getFetchTime() <= curTime) {
         fetchCnt++;
         fs.setFetchSchedule(new Text("http://www.example.com"), p,
                 p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
                 changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
         System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
-                + (p.getFetchInterval() / (float) (3600 * 24)) + " days");
+                + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
         if (!changed) miss++;
         if (miss > maxMiss) maxMiss = miss;
         changed = false;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Mon Sep  3 06:37:24 2007
@@ -29,7 +29,7 @@
   public static final String FETCH_DIR_NAME = "crawl_fetch";
   public static final String PARSE_DIR_NAME = "crawl_parse";
 
-  private final static byte CUR_VERSION = 5;
+  private final static byte CUR_VERSION = 6;
 
   /** Compatibility values for on-the-fly conversion from versions < 5. */
   private static final byte OLD_STATUS_SIGNATURE = 0;
@@ -114,7 +114,7 @@
   private byte status;
   private long fetchTime = System.currentTimeMillis();
   private byte retries;
-  private float fetchInterval;
+  private int fetchInterval;
   private float score = 1.0f;
   private byte[] signature = null;
   private long modifiedTime;
@@ -134,12 +134,12 @@
     metaData = new MapWritable();
   }
 
-  public CrawlDatum(int status, float fetchInterval) {
+  public CrawlDatum(int status, int fetchInterval) {
     this.status = (byte)status;
     this.fetchInterval = fetchInterval;
   }
 
-  public CrawlDatum(int status, float fetchInterval, float score) {
+  public CrawlDatum(int status, int fetchInterval, float score) {
     this(status, fetchInterval);
     this.score = score;
   }
@@ -172,10 +172,13 @@
   public byte getRetriesSinceFetch() { return retries; }
   public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;}
 
-  public float getFetchInterval() { return fetchInterval; }
-  public void setFetchInterval(float fetchInterval) {
+  public int getFetchInterval() { return fetchInterval; }
+  public void setFetchInterval(int fetchInterval) {
     this.fetchInterval = fetchInterval;
   }
+  public void setFetchInterval(float fetchInterval) {
+    this.fetchInterval = Math.round(fetchInterval);
+  }
 
   public float getScore() { return score; }
   public void setScore(float score) { this.score = score; }
@@ -221,7 +224,9 @@
     status = in.readByte();
     fetchTime = in.readLong();
     retries = in.readByte();
-    fetchInterval = in.readFloat();
+    if (version > 5) {
+      fetchInterval = in.readInt();
+    } else fetchInterval = Math.round(in.readFloat());
     score = in.readFloat();
     if (version > 2) {
       modifiedTime = in.readLong();
@@ -256,7 +261,7 @@
     out.writeByte(status);
     out.writeLong(fetchTime);
     out.writeByte(retries);
-    out.writeFloat(fetchInterval);
+    out.writeInt(fetchInterval);
     out.writeFloat(score);
     out.writeLong(modifiedTime);
     if (signature == null) {
@@ -330,8 +335,8 @@
       int retries2 = b2[s2+1+1+8];
       if (retries2 != retries1)
         return retries2 - retries1;
-      float fetchInterval1 = readFloat(b1, s1+1+1+8+1);
-      float fetchInterval2 = readFloat(b2, s2+1+1+8+1);
+      int fetchInterval1 = readInt(b1, s1+1+1+8+1);
+      int fetchInterval2 = readInt(b2, s2+1+1+8+1);
       if (fetchInterval2 != fetchInterval1)
         return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
       long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
@@ -409,7 +414,7 @@
       ((int)fetchTime) ^
       ((int)modifiedTime) ^
       retries ^
-      Float.floatToIntBits(fetchInterval) ^
+      fetchInterval ^
       Float.floatToIntBits(score);
   }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Mon Sep  3 06:37:24 2007
@@ -55,10 +55,12 @@
 
   public static class Merger extends MapReduceBase implements Reducer {
     MapWritable meta = new MapWritable();
+    private FetchSchedule schedule;
 
     public void close() throws IOException {}
 
     public void configure(JobConf conf) {
+      schedule = FetchScheduleFactory.getFetchSchedule(conf);
     }
 
     public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
@@ -70,17 +72,17 @@
         CrawlDatum val = (CrawlDatum) values.next();
         if (res == null) {
           res = val;
-          resTime = res.getFetchTime() - Math.round(res.getFetchInterval() * 3600 * 24 * 1000);
+          resTime = schedule.calculateLastFetchTime(res);
           meta.putAll(res.getMetaData());
           continue;
         }
         // compute last fetch time, and pick the latest
-        long valTime = val.getFetchTime() - Math.round(val.getFetchInterval() * 3600 * 24 * 1000);
+        long valTime = schedule.calculateLastFetchTime(val);
         if (valTime > resTime) {
           // collect all metadata, newer values override older values
           meta.putAll(val.getMetaData());
           res = val;
-          resTime = res.getFetchTime() - Math.round(res.getFetchInterval() * 3600 * 24 * 1000);
+          resTime = valTime ;
         } else {
           // insert older metadata before newer
           val.getMetaData().putAll(meta);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Sep  3 06:37:24 2007
@@ -40,14 +40,16 @@
   private ArrayList<CrawlDatum> linked = new ArrayList<CrawlDatum>();
   private ScoringFilters scfilters = null;
   private boolean additionsAllowed;
-  private float maxInterval;
+  private int maxInterval;
   private FetchSchedule schedule;
 
   public void configure(JobConf job) {
     retryMax = job.getInt("db.fetch.retry.max", 3);
     scfilters = new ScoringFilters(job);
     additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
-    maxInterval = (float)(job.getInt("db.max.fetch.interval", 30) * 3600 * 24);
+    int oldMaxInterval = job.getInt("db.max.fetch.interval", 0);
+    maxInterval = job.getInt("db.fetch.interval.max", 0 );
+    if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
     schedule = FetchScheduleFactory.getFetchSchedule(job);
   }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java Mon Sep  3 06:37:24 2007
@@ -32,7 +32,7 @@
   public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
           long prevFetchTime, long prevModifiedTime,
           long fetchTime, long modifiedTime, int state) {
-    datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0d));
+    datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
     datum.setModifiedTime(modifiedTime);
     return datum;
   }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Mon Sep  3 06:37:24 2007
@@ -35,7 +35,7 @@
   /** Page is known to remain unmodified since our last visit. */
   public static final int STATUS_NOTMODIFIED    = 2;
   
-  public static final float SECONDS_PER_DAY = 3600.0f * 24.0f;
+  public static final int SECONDS_PER_DAY = 3600 * 24;
   /**
    * Initialize fetch schedule related data. Implementations should at least
    * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
@@ -111,6 +111,12 @@
   public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
           long prevFetchTime, long prevModifiedTime, long fetchTime);
   
+  /**
+   * Calculates last fetch time of the given CrawlDatum.
+   * @return the date as a long.
+   */
+  public long calculateLastFetchTime(CrawlDatum datum);
+
   /**
    * This method provides information whether the page is suitable for
    * selection in the current fetchlist. NOTE: a true return value does not

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Mon Sep  3 06:37:24 2007
@@ -46,7 +46,7 @@
   /** Normalize and filter injected urls. */
   public static class InjectMapper implements Mapper {
     private URLNormalizers urlNormalizers;
-    private float interval;
+    private int interval;
     private float scoreInjected;
     private JobConf jobConf;
     private URLFilters filters;
@@ -57,7 +57,7 @@
     public void configure(JobConf job) {
       this.jobConf = job;
       urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
-      interval = jobConf.getFloat("db.fetch.interval.default", 2592000.0f);
+      interval = jobConf.getInt("db.fetch.interval.default", 2592000);
       filters = new URLFilters(jobConf);
       scfilters = new ScoringFilters(jobConf);
       scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Mon Sep  3 06:37:24 2007
@@ -82,7 +82,7 @@
     this.filters = new URLFilters(job);
     this.scfilters = new ScoringFilters(job);
     final UrlValidator validator = UrlValidator.get();
-    final float interval = job.getFloat("db.fetch.interval.default", 2592000.0f);
+    final int interval = job.getInt("db.fetch.interval.default", 2592000);
     final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
     final int maxOutlinks = job.getInt("db.max.outlinks.per.page", 100);
     final CompressionType compType = SequenceFile.getCompressionType(job);
@@ -125,7 +125,7 @@
             byte[] signature = StringUtil.fromHexString(sig);
             if (signature != null) {
               // append a CrawlDatum with a signature
-              CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f);
+              CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
               d.setSignature(signature);
               crawlOut.append(key, d);
             }

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Mon Sep  3 06:37:24 2007
@@ -341,7 +341,7 @@
    * @return Constructed object
    */
   private URLCrawlDatum createURLCrawlDatum(final String url,
-      final float fetchInterval, final float score) {
+      final int fetchInterval, final float score) {
     return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
         CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
   }

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java?rev=572335&r1=572334&r2=572335&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java Mon Sep  3 06:37:24 2007
@@ -84,7 +84,7 @@
     assertEquals(100, datum2.size());
     testWritable(datum2);
 
-    CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1f);
+    CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1);
     c.setMetaData(new MapWritable());
     for (int i = 0; i < 100; i++) {
       c.getMetaData().put(new LongWritable(i), new Text("" + 1));