You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/04/11 21:18:54 UTC

svn commit: r393297 - /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Author: ab
Date: Tue Apr 11 12:18:52 2006
New Revision: 393297

URL: http://svn.apache.org/viewcvs?rev=393297&view=rev
Log:
Re-instate support for content-level redirects.

Use the correct key when FetcherThread.output() is called
after redirects: it should use the new URL as the key,
not the original one.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=393297&r1=393296&r2=393297&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Apr 11 12:18:52 2006
@@ -80,6 +80,7 @@
     private Configuration conf;
     private URLFilters urlFilters;
     private ParseUtil parseUtil;
+    private UrlNormalizer normalizer;
     private ProtocolFactory protocolFactory;
 
     public FetcherThread(Configuration conf) {
@@ -89,6 +90,7 @@
       this.urlFilters = new URLFilters(conf);
       this.parseUtil = new ParseUtil(conf);
       this.protocolFactory = new ProtocolFactory(conf);
+      this.normalizer = new UrlNormalizerFactory(conf).getNormalizer();
     }
 
     public void run() {
@@ -117,7 +119,8 @@
           }
 
           // url may be changed through redirects.
-          String url = key.toString();
+          UTF8 url = new UTF8();
+          url.set(key);
           try {
             LOG.info("fetching " + url);            // fetch the page
             
@@ -126,30 +129,47 @@
             do {
               redirecting = false;
               LOG.fine("redirectCount=" + redirectCount);
-              Protocol protocol = this.protocolFactory.getProtocol(url);
-              ProtocolOutput output = protocol.getProtocolOutput(new UTF8(url), datum);
+              Protocol protocol = this.protocolFactory.getProtocol(url.toString());
+              ProtocolOutput output = protocol.getProtocolOutput(url, datum);
               ProtocolStatus status = output.getStatus();
               Content content = output.getContent();
+              ParseStatus pstatus = null;
 
               switch(status.getCode()) {
 
               case ProtocolStatus.SUCCESS:        // got a page
-                output(key, datum, content, CrawlDatum.STATUS_FETCH_SUCCESS);
+                pstatus = output(url, datum, content, CrawlDatum.STATUS_FETCH_SUCCESS);
                 updateStatus(content.getContent().length);
+                if (pstatus != null && pstatus.isSuccess() &&
+                        pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+                  String newUrl = pstatus.getMessage();
+                  newUrl = normalizer.normalize(newUrl);
+                  newUrl = this.urlFilters.filter(newUrl);
+                  if (newUrl != null && !newUrl.equals(url.toString())) {
+                    url = new UTF8(newUrl);
+                    redirecting = true;
+                    redirectCount++;
+                    LOG.fine(" - content redirect to " + url);
+                  } else {
+                    LOG.fine(" - content redirect skipped: " +
+                             (url.equals(newUrl.toString()) ? "to same url" : "filtered"));
+                  }
+                }
                 break;
 
               case ProtocolStatus.MOVED:         // redirect
               case ProtocolStatus.TEMP_MOVED:
                 String newUrl = status.getMessage();
+                newUrl = normalizer.normalize(newUrl);
                 newUrl = this.urlFilters.filter(newUrl);
-                if (newUrl != null && !newUrl.equals(url)) {
-                  url = newUrl;
+                if (newUrl != null && !newUrl.equals(url.toString())) {
+                  url = new UTF8(newUrl);
                   redirecting = true;
                   redirectCount++;
                   LOG.fine(" - protocol redirect to " + url);
                 } else {
                   LOG.fine(" - protocol redirect skipped: " +
-                           (url.equals(newUrl) ? "to same url" : "filtered"));
+                           (url.equals(newUrl.toString()) ? "to same url" : "filtered"));
                 }
                 break;
 
@@ -157,7 +177,7 @@
                 logError(url, status.getMessage());
               case ProtocolStatus.RETRY:          // retry
                 datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
-                output(key, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
+                output(url, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
                 break;
                 
               case ProtocolStatus.GONE:           // gone
@@ -165,17 +185,17 @@
               case ProtocolStatus.ACCESS_DENIED:
               case ProtocolStatus.ROBOTS_DENIED:
               case ProtocolStatus.NOTMODIFIED:
-                output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+                output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
                 break;
 
               default:
                 LOG.warning("Unknown ProtocolStatus: " + status.getCode());
-                output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+                output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
               }
 
               if (redirecting && redirectCount >= maxRedirect) {
                 LOG.info(" - redirect count exceeded " + url);
-                output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+                output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
               }
 
             } while (redirecting && (redirectCount < maxRedirect));
@@ -183,7 +203,7 @@
             
           } catch (Throwable t) {                 // unexpected exception
             logError(url, t.toString());
-            output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+            output(url, datum, null, CrawlDatum.STATUS_FETCH_GONE);
             
           }
         }
@@ -196,14 +216,14 @@
       }
     }
 
-    private void logError(String url, String message) {
+    private void logError(UTF8 url, String message) {
       LOG.info("fetch of " + url + " failed with: " + message);
       synchronized (Fetcher.this) {               // record failure
         errors++;
       }
     }
 
-    private void output(UTF8 key, CrawlDatum datum,
+    private ParseStatus output(UTF8 key, CrawlDatum datum,
                         Content content, int status) {
 
       datum.setStatus(status);
@@ -252,6 +272,8 @@
         e.printStackTrace();
         LOG.severe("fetcher caught:"+e.toString());
       }
+      if (parse != null) return parse.getData().getStatus();
+      else return null;
     }
     
   }