You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/11/10 22:03:17 UTC

svn commit: r332371 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java

Author: cutting
Date: Thu Nov 10 13:03:16 2005
New Revision: 332371

URL: http://svn.apache.org/viewcvs?rev=332371&view=rev
Log:
Fix: do not increment the URL count when a URL is skipped because of the
maxPerHost limit.  Patch contributed by Rod Taylor.

Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=332371&r1=332370&r2=332371&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Thu Nov 10 13:03:16 2005
@@ -76,23 +76,27 @@
                        OutputCollector output, Reporter reporter)
       throws IOException {
 
-      while (values.hasNext() && ++count < limit) {
+      while (values.hasNext() && count < limit) {
 
         UTF8 url = (UTF8)values.next();
 
-        if (maxPerHost > 0) {                       // are we counting hosts?
+        if (maxPerHost > 0) {                     // are we counting hosts?
           String host = new URL(url.toString()).getHost();
-          Integer count = (Integer)hostCounts.get(host);
-          if (count != null) {
-            if (count.intValue() >= maxPerHost)
+          Integer hostCount = (Integer)hostCounts.get(host);
+          if (hostCount != null) {
+            if (hostCount.intValue() >= maxPerHost)
               continue;                           // too many from host
-            hostCounts.put(host, new Integer(count.intValue()+1));
+            hostCounts.put(host, new Integer(hostCount.intValue()+1));
           } else {                                // update host count
             hostCounts.put(host, new Integer(1));
           }
         }
 
         output.collect(key, url);
+
+        // Count is incremented only when we keep the URL
+        // maxPerHost may cause us to skip it.
+        count++;
       }
 
     }