You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/06/16 12:33:25 UTC

svn commit: r547901 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher2.java

Author: dogacan
Date: Sat Jun 16 03:33:24 2007
New Revision: 547901

URL: http://svn.apache.org/viewvc?view=rev&rev=547901
Log:
NUTCH-495 - Unnecessary delays in Fetcher2.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=547901&r1=547900&r2=547901
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Jun 16 03:33:24 2007
@@ -30,7 +30,7 @@
 10. NUTCH-392 - OutputFormat implementations should pass on Progressable.
     (cutting via ab)
 
-  
+11. NUTCH-495 - Unnecessary delays in Fetcher2 (dogacan)
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=547901&r1=547900&r2=547901
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sat Jun 16 03:33:24 2007
@@ -195,7 +195,7 @@
   private static class FetchItemQueue {
     List<FetchItem> queue = Collections.synchronizedList(new LinkedList<FetchItem>());
     Set<FetchItem>  inProgress = Collections.synchronizedSet(new HashSet<FetchItem>());
-    AtomicLong endTime = new AtomicLong();
+    AtomicLong nextFetchTime = new AtomicLong();
     long crawlDelay;
     long minCrawlDelay;
     int maxThreads;
@@ -207,7 +207,7 @@
       this.crawlDelay = crawlDelay;
       this.minCrawlDelay = minCrawlDelay;
       // ready to start
-      this.endTime.set(System.currentTimeMillis() - crawlDelay);
+      setEndTime(System.currentTimeMillis() - crawlDelay);
     }
     
     public int getQueueSize() {
@@ -218,10 +218,10 @@
       return inProgress.size();
     }
     
-    public void finishFetchItem(FetchItem it) {
+    public void finishFetchItem(FetchItem it, boolean asap) {
       if (it != null) {
         inProgress.remove(it);
-        endTime.set(System.currentTimeMillis());
+        setEndTime(System.currentTimeMillis(), asap);
       }
     }
     
@@ -238,8 +238,7 @@
     public FetchItem getFetchItem() {
       if (inProgress.size() >= maxThreads) return null;
       long now = System.currentTimeMillis();
-      long last = endTime.get() + (maxThreads > 1 ? minCrawlDelay : crawlDelay);
-      if (last > now) return null;
+      if (nextFetchTime.get() > now) return null;
       FetchItem it = null;
       if (queue.size() == 0) return null;
       try {
@@ -256,13 +255,24 @@
       LOG.info("  inProgress    = " + inProgress.size());
       LOG.info("  crawlDelay    = " + crawlDelay);
       LOG.info("  minCrawlDelay = " + minCrawlDelay);
-      LOG.info("  endTime       = " + endTime.get());
+      LOG.info("  nextFetchTime = " + nextFetchTime.get());
       LOG.info("  now           = " + System.currentTimeMillis());
       for (int i = 0; i < queue.size(); i++) {
         FetchItem it = queue.get(i);
         LOG.info("  " + i + ". " + it.url);
       }
     }
+    
+    private void setEndTime(long endTime) {
+      setEndTime(endTime, false);
+    }
+    
+    private void setEndTime(long endTime, boolean asap) {
+      if (!asap)
+        nextFetchTime.set(endTime + (maxThreads > 1 ? minCrawlDelay : crawlDelay));
+      else
+        nextFetchTime.set(endTime);
+    }
   }
   
   /**
@@ -308,12 +318,16 @@
     }
     
     public void finishFetchItem(FetchItem it) {
+      finishFetchItem(it, false);
+    }
+    
+    public void finishFetchItem(FetchItem it, boolean asap) {
       FetchItemQueue fiq = queues.get(it.queueID);
       if (fiq == null) {
         LOG.warn("Attempting to finish item from unknown queue: " + it);
         return;
       }
-      fiq.finishFetchItem(it);
+      fiq.finishFetchItem(it, asap);
     }
     
     public synchronized FetchItemQueue getFetchItemQueue(String id) {
@@ -474,7 +488,7 @@
               RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);
               if (!rules.isAllowed(fit.u)) {
                 // unblock
-                fetchQueues.finishFetchItem(fit);
+                fetchQueues.finishFetchItem(fit, true);
                 if (LOG.isDebugEnabled()) {
                   LOG.debug("Denied by robots.txt: " + fit.url);
                 }
@@ -484,7 +498,7 @@
               if (rules.getCrawlDelay() > 0) {
                 if (rules.getCrawlDelay() > maxCrawlDelay) {
                   // unblock
-                  fetchQueues.finishFetchItem(fit);
+                  fetchQueues.finishFetchItem(fit, true);
                   LOG.debug("Crawl-Delay for " + fit.url + " too long (" + rules.getCrawlDelay() + "), skipping");
                   output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                   continue;
@@ -503,8 +517,6 @@
               switch(status.getCode()) {
                 
               case ProtocolStatus.WOULDBLOCK:
-                // unblock
-                fetchQueues.finishFetchItem(fit);
                 // retry ?
                 fetchQueues.addFetchItem(fit);
                 break;