You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2013/06/13 14:10:38 UTC
svn commit: r1492639 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
src/java/org/apache/nutch/tools/FreeGenerator.java
Author: markus
Date: Thu Jun 13 12:10:37 2013
New Revision: 1492639
URL: http://svn.apache.org/r1492639
Log:
NUTCH-1430 FreeGenerator records overwrite CrawlDB records with AdaptiveFetchSchedule
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1492639&r1=1492638&r2=1492639&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jun 13 12:10:37 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1430 Freegenerator records overwrite CrawlDB records with AdaptiveFetchSchedule (markus)
+
* NUTCH-1522 Upgrade to Tika 1.3 (jnioche)
* NUTCH-1578 Upgrade to Hadoop 1.2.0 (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1492639&r1=1492638&r2=1492639&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Thu Jun 13 12:10:37 2013
@@ -94,6 +94,9 @@ public class AdaptiveFetchSchedule exten
float interval = datum.getFetchInterval();
long refTime = fetchTime;
+ // https://issues.apache.org/jira/browse/NUTCH-1430
+ interval = (interval == 0) ? defaultInterval : interval;
+
if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
// Is fetch interval preset in CrawlDatum MD? Then use preset interval
FloatWritable customIntervalWritable= (FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=1492639&r1=1492638&r2=1492639&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Thu Jun 13 12:10:37 2013
@@ -72,10 +72,12 @@ public class FreeGenerator extends Confi
private ScoringFilters scfilters;
private CrawlDatum datum = new CrawlDatum();
private Text url = new Text();
+ private int defaultInterval = 0;
@Override
public void configure(JobConf job) {
super.configure(job);
+ defaultInterval = job.getInt("db.fetch.interval.default", 0);
scfilters = new ScoringFilters(job);
if (job.getBoolean(FILTER_KEY, false)) {
filters = new URLFilters(job);
@@ -84,7 +86,7 @@ public class FreeGenerator extends Confi
normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
}
}
-
+
Generator.SelectorEntry entry = new Generator.SelectorEntry();
public void map(WritableComparable<?> key, Text value, OutputCollector<Text,
@@ -114,6 +116,8 @@ public class FreeGenerator extends Confi
}
entry.datum = datum;
entry.url = url;
+ // https://issues.apache.org/jira/browse/NUTCH-1430
+ entry.datum.setFetchInterval(defaultInterval);
output.collect(url, entry);
}
@@ -131,7 +135,7 @@ public class FreeGenerator extends Confi
}
}
}
-
+
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
@@ -156,7 +160,7 @@ public class FreeGenerator extends Confi
}
}
}
-
+
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("FreeGenerator: starting at " + sdf.format(start));