You are viewing a plain text version of this content; the canonical (HTML) version is available in the mailing-list archive.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/18 23:13:57 UTC

svn commit: r219568 - in /lucene/nutch/branches/mapred: conf/nutch-default.xml src/java/org/apache/nutch/crawl/Generator.java

Author: cutting
Date: Mon Jul 18 14:08:36 2005
New Revision: 219568

URL: http://svn.apache.org/viewcvs?rev=219568&view=rev
Log:
Add per-host url limit in generate.

Modified:
    lucene/nutch/branches/mapred/conf/nutch-default.xml
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=219568&r1=219567&r2=219568&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Mon Jul 18 14:08:36 2005
@@ -262,6 +262,15 @@
   recoverable errors is generated for fetch.</description>
 </property>
 
+<!-- generate properties -->
+
+<property>
+  <name>generate.max.per.host</name>
+  <value>-1</value>
+  <description>The maximum number of urls per host in a single
+  fetchlist.  -1 if unlimited.</description>
+</property>
+
 <!-- fetchlist tool properties -->
 
 <property>

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=219568&r1=219567&r2=219568&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Mon Jul 18 14:08:36 2005
@@ -39,10 +39,13 @@
     private long curTime;
     private long limit;
     private long count;
+    private HashMap hostCounts = new HashMap();
+    private int maxPerHost;
 
     public void configure(JobConf job) {
       curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
       limit = job.getLong("crawl.gen.limit", Long.MAX_VALUE);
+      maxPerHost = job.getInt("generate.max.per.host", -1);
     }
 
     /** Select & invert subset due for fetch. */
@@ -55,6 +58,18 @@
 
       if (crawlDatum.getFetchTime() > curTime)
         return;                                   // not time yet
+
+      if (maxPerHost > 0) {                       // are we counting hosts?
+        String host = new URL(((UTF8)key).toString()).getHost();
+        Integer count = (Integer)hostCounts.get(host);
+        if (count != null) {
+          if (count.intValue() >= maxPerHost)
+            return;                               // too many from host
+          hostCounts.put(host, new Integer(count.intValue()+1));
+        } else {                                  // update host count
+          hostCounts.put(host, new Integer(1));
+        }
+      }
 
       output.collect(crawlDatum, key);          // invert for sort by linkCount
     }