You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/01/16 00:07:16 UTC
svn commit: r496535 -
/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Author: ab
Date: Mon Jan 15 15:07:15 2007
New Revision: 496535
URL: http://svn.apache.org/viewvc?view=rev&rev=496535
Log:
Pick the right entry, as indicated by the same generate time.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=496535&r1=496534&r2=496535
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Mon Jan 15 15:07:15 2007
@@ -299,6 +299,12 @@
* Update the CrawlDB so that the next generate won't include the same URLs.
*/
public static class CrawlDbUpdater extends MapReduceBase implements Mapper, Reducer {
+ long generateTime;
+
+ public void configure(JobConf job) {
+ generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
+ }
+
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
if (key instanceof FloatWritable) { // tempDir source
SelectorEntry se = (SelectorEntry)value;
@@ -315,6 +321,11 @@
CrawlDatum val = (CrawlDatum)values.next();
if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
genTime = (LongWritable)val.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
+ if (genTime.get() != generateTime) {
+ orig = val;
+ genTime = null;
+ continue;
+ }
} else {
orig = val;
}
@@ -384,7 +395,8 @@
}
job.setLong(CRAWL_GEN_CUR_TIME, curTime);
// record real generation time
- job.setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
+ long generateTime = System.currentTimeMillis();
+ job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
job.setLong(CRAWL_TOP_N, topN);
job.setBoolean(CRAWL_GENERATE_FILTER, filter);
@@ -453,6 +465,7 @@
job = new NutchJob(getConf());
job.setJobName("generate: updatedb " + dbDir);
+ job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
job.addInputPath(tempDir);
job.addInputPath(new Path(dbDir, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
@@ -492,7 +505,7 @@
}
/**
- * Generate a fetchlist from the pagedb and linkdb
+ * Generate a fetchlist from the crawldb.
*/
public static void main(String args[]) throws Exception {
int res = new Generator().doMain(NutchConfiguration.create(), args);