You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/01/13 15:31:23 UTC

svn commit: r1231090 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/Generator.java

Author: markus
Date: Fri Jan 13 14:31:22 2012
New Revision: 1231090

URL: http://svn.apache.org/viewvc?rev=1231090&view=rev
Log:
NUTCH-1177 Generator to select on retry interval

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1231090&r1=1231089&r2=1231090&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 13 14:31:22 2012
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-1177 Generator to select on retry interval (markus)
+
 * NUTCH-1246 Upgrade to Hadoop 1.0.0 (jnioche)
 
 * NUTCH-1139 Indexer to delete gone documents (markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1231090&r1=1231089&r2=1231090&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jan 13 14:31:22 2012
@@ -587,7 +587,22 @@
   fetchlist.  -1 if unlimited.</description>
 </property>
 
+<property>
+  <name>generate.min.score</name>
+  <value>0</value>
+  <description>Select only entries with a score larger than or equal to
+  generate.min.score.</description>
+</property>
+
+<property>
+  <name>generate.min.interval</name>
+  <value>-1</value>
+  <description>Select only entries with a retry interval lower than or equal to
+  generate.min.interval. A value of -1 disables this check.</description>
+</property>
+
 <!-- urlpartitioner properties -->
+
 <property>
   <name>partition.url.mode</name>
   <value>byHost</value>

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1231090&r1=1231089&r2=1231090&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Jan 13 14:31:22 2012
@@ -61,6 +61,7 @@ public class Generator extends Configure
 
   public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
   public static final String GENERATOR_MIN_SCORE = "generate.min.score";
+  public static final String GENERATOR_MIN_INTERVAL = "generate.min.interval";
   public static final String GENERATOR_FILTER = "generate.filter";
   public static final String GENERATOR_NORMALISE = "generate.normalise";
   public static final String GENERATOR_MAX_COUNT = "generate.max.count";
@@ -129,6 +130,7 @@ public class Generator extends Configure
     private long genDelay;
     private FetchSchedule schedule;
     private float scoreThreshold = 0f;
+    private int intervalThreshold = -1;
     private int maxNumSegments = 1;
     int currentsegmentnum = 1;
 
@@ -155,6 +157,7 @@ public class Generator extends Configure
       if (time > 0) genTime.set(time);
       schedule = FetchScheduleFactory.getFetchSchedule(job);
       scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
+      intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
       maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
       segCounts = new int[maxNumSegments];
     }
@@ -205,6 +208,9 @@ public class Generator extends Configure
       // consider only entries with a score superior to the threshold
       if (scoreThreshold != Float.NaN && sort < scoreThreshold) return;
 
+      // consider only entries with a retry (or fetch) interval not exceeding the threshold
+      if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) return;
+
       // sort by decreasing score, using DecreasingFloatComparator
       sortValue.set(sort);
       // record generation time