You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/11/10 22:03:17 UTC
svn commit: r332371 -
/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
Author: cutting
Date: Thu Nov 10 13:03:16 2005
New Revision: 332371
URL: http://svn.apache.org/viewcvs?rev=332371&view=rev
Log:
Fix: do not increment the count of URLs when a URL is filtered out by
the maxPerHost limit. Patch contributed by Rod Taylor.
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=332371&r1=332370&r2=332371&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Thu Nov 10 13:03:16 2005
@@ -76,23 +76,27 @@
OutputCollector output, Reporter reporter)
throws IOException {
- while (values.hasNext() && ++count < limit) {
+ while (values.hasNext() && count < limit) {
UTF8 url = (UTF8)values.next();
- if (maxPerHost > 0) { // are we counting hosts?
+ if (maxPerHost > 0) { // are we counting hosts?
String host = new URL(url.toString()).getHost();
- Integer count = (Integer)hostCounts.get(host);
- if (count != null) {
- if (count.intValue() >= maxPerHost)
+ Integer hostCount = (Integer)hostCounts.get(host);
+ if (hostCount != null) {
+ if (hostCount.intValue() >= maxPerHost)
continue; // too many from host
- hostCounts.put(host, new Integer(count.intValue()+1));
+ hostCounts.put(host, new Integer(hostCount.intValue()+1));
} else { // update host count
hostCounts.put(host, new Integer(1));
}
}
output.collect(key, url);
+
+ // Count is incremented only when we keep the URL
+ // maxPerHost may cause us to skip it.
+ count++;
}
}