You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/18 23:13:57 UTC
svn commit: r219568 - in /lucene/nutch/branches/mapred:
conf/nutch-default.xml src/java/org/apache/nutch/crawl/Generator.java
Author: cutting
Date: Mon Jul 18 14:08:36 2005
New Revision: 219568
URL: http://svn.apache.org/viewcvs?rev=219568&view=rev
Log:
Add per-host url limit in generate.
Modified:
lucene/nutch/branches/mapred/conf/nutch-default.xml
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=219568&r1=219567&r2=219568&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Mon Jul 18 14:08:36 2005
@@ -262,6 +262,15 @@
recoverable errors is generated for fetch.</description>
</property>
+<!-- generate properties -->
+
+<property>
+ <name>generate.max.per.host</name>
+ <value>-1</value>
+ <description>The maximum number of URLs per host in a single
+ fetchlist. -1 (or any non-positive value) means unlimited.</description>
+</property>
+
<!-- fetchlist tool properties -->
<property>
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=219568&r1=219567&r2=219568&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Mon Jul 18 14:08:36 2005
@@ -39,10 +39,13 @@
private long curTime;
private long limit;
private long count;
+ private HashMap hostCounts = new HashMap();
+ private int maxPerHost;
public void configure(JobConf job) {
curTime = job.getLong("crawl.gen.curTime", System.currentTimeMillis());
limit = job.getLong("crawl.gen.limit", Long.MAX_VALUE);
+ maxPerHost = job.getInt("generate.max.per.host", -1);
}
/** Select & invert subset due for fetch. */
@@ -55,6 +58,18 @@
if (crawlDatum.getFetchTime() > curTime)
return; // not time yet
+
+ if (maxPerHost > 0) { // are we counting hosts?
+ String host = new URL(((UTF8)key).toString()).getHost();
+ Integer count = (Integer)hostCounts.get(host);
+ if (count != null) {
+ if (count.intValue() >= maxPerHost)
+ return; // too many from host
+ hostCounts.put(host, new Integer(count.intValue()+1));
+ } else { // update host count
+ hostCounts.put(host, new Integer(1));
+ }
+ }
output.collect(crawlDatum, key); // invert for sort by linkCount
}