You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/01/13 15:31:23 UTC
svn commit: r1231090 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/crawl/Generator.java
Author: markus
Date: Fri Jan 13 14:31:22 2012
New Revision: 1231090
URL: http://svn.apache.org/viewvc?rev=1231090&view=rev
Log:
NUTCH-1177 Generator to select on retry interval
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1231090&r1=1231089&r2=1231090&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 13 14:31:22 2012
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1177 Generator to select on retry interval (markus)
+
* NUTCH-1246 Upgrade to Hadoop 1.0.0 (jnioche)
* NUTCH-1139 Indexer to delete gone documents (markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1231090&r1=1231089&r2=1231090&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jan 13 14:31:22 2012
@@ -587,7 +587,22 @@
fetchlist. -1 if unlimited.</description>
</property>
+<property>
+ <name>generate.min.score</name>
+ <value>0</value>
+ <description>Select only entries with a score equal to or larger than
+ generate.min.score.</description>
+</property>
+
+<property>
+ <name>generate.min.interval</name>
+ <value>-1</value>
+ <description>Select only entries with a retry interval lower than or equal
+ to generate.min.interval. A value of -1 disables this check.</description>
+</property>
+
<!-- urlpartitioner properties -->
+
<property>
<name>partition.url.mode</name>
<value>byHost</value>
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1231090&r1=1231089&r2=1231090&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan 13 14:31:22 2012
@@ -61,6 +61,7 @@ public class Generator extends Configure
public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
public static final String GENERATOR_MIN_SCORE = "generate.min.score";
+ public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval";
public static final String GENERATOR_FILTER = "generate.filter";
public static final String GENERATOR_NORMALISE = "generate.normalise";
public static final String GENERATOR_MAX_COUNT = "generate.max.count";
@@ -129,6 +130,7 @@ public class Generator extends Configure
private long genDelay;
private FetchSchedule schedule;
private float scoreThreshold = 0f;
+ private int intervalThreshold = -1;
private int maxNumSegments = 1;
int currentsegmentnum = 1;
@@ -155,6 +157,7 @@ public class Generator extends Configure
if (time > 0) genTime.set(time);
schedule = FetchScheduleFactory.getFetchSchedule(job);
scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
+ intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
segCounts = new int[maxNumSegments];
}
@@ -205,6 +208,9 @@ public class Generator extends Configure
// consider only entries with a score superior to the threshold
if (scoreThreshold != Float.NaN && sort < scoreThreshold) return;
+ // consider only entries with a retry (or fetch) interval lower than threshold
+ if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) return;
+
// sort by decreasing score, using DecreasingFloatComparator
sortValue.set(sort);
// record generation time