You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2008/03/17 13:33:58 UTC

svn commit: r637858 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/fetcher/Fetcher2.java src/java/org/apache/nutch/parse/ParseOutputFormat.java

Author: ab
Date: Mon Mar 17 05:33:56 2008
New Revision: 637858

URL: http://svn.apache.org/viewvc?rev=637858&view=rev
Log:
NUTCH-615 Redirected URL-s fetched without setting fetchInterval. Guard against
reprUrl being null.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=637858&r1=637857&r2=637858&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Mar 17 05:33:56 2008
@@ -229,6 +229,9 @@
 
 83. NUTCH-126 - Fetching https does not work with a proxy (Fritz Elfert via ab)
 
+84. NUTCH-615 - Redirected URL-s fetched without setting fetchInterval.
+    Guard against reprUrl being null. (Emmanuel Joke, ab)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=637858&r1=637857&r2=637858&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Mon Mar 17 05:33:56 2008
@@ -282,8 +282,10 @@
           return url;
         } else {
           CrawlDatum newDatum = new CrawlDatum();
-          newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-              new Text(reprUrl));
+          if (reprUrl != null) {
+            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                new Text(reprUrl));
+          }
           output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
           if (LOG.isDebugEnabled()) {
             LOG.debug(" - " + redirType + " redirect to " +

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=637858&r1=637857&r2=637858&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Mon Mar 17 05:33:56 2008
@@ -549,9 +549,12 @@
                                    refreshTime < Fetcher.PERM_REFRESH_TIME,
                                    Fetcher.CONTENT_REDIR);
                   if (redirUrl != null) {
-                    CrawlDatum newDatum = new CrawlDatum();
-                    newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-                        new Text(reprUrl));
+                    CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
+                        fit.datum.getFetchInterval(), fit.datum.getScore());
+                    if (reprUrl != null) {
+                      newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                          new Text(reprUrl));
+                    }
                     fit = FetchItem.create(redirUrl, newDatum, byIP);
                     if (fit != null) {
                       FetchItemQueue fiq =
@@ -582,14 +585,22 @@
                   handleRedirect(fit.url, fit.datum,
                                  urlString, newUrl, temp,
                                  Fetcher.PROTOCOL_REDIR);
-                CrawlDatum newDatum = new CrawlDatum();
-                newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-                    new Text(reprUrl));
-                fit = FetchItem.create(redirUrl, newDatum, byIP);
-                if (fit != null) {
-                  FetchItemQueue fiq =
-                    fetchQueues.getFetchItemQueue(fit.queueID);
-                  fiq.addInProgressFetchItem(fit);
+                if (redirUrl != null) {
+                  CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
+                      fit.datum.getFetchInterval(), fit.datum.getScore());
+                  if (reprUrl != null) {
+                    newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                        new Text(reprUrl));
+                  }
+                  fit = FetchItem.create(redirUrl, newDatum, byIP);
+                  if (fit != null) {
+                    FetchItemQueue fiq =
+                      fetchQueues.getFetchItemQueue(fit.queueID);
+                    fiq.addInProgressFetchItem(fit);
+                  } else {
+                    // stop redirecting
+                    redirecting = false;
+                  }
                 } else {
                   // stop redirecting
                   redirecting = false;
@@ -622,7 +633,7 @@
                 if (LOG.isWarnEnabled()) {
                   LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                 }
-                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
+                output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
               }
 
               if (redirecting && redirectCount >= maxRedirect) {
@@ -674,8 +685,10 @@
           return url;
         } else {
           CrawlDatum newDatum = new CrawlDatum();
-          newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-              new Text(reprUrl));
+          if (reprUrl != null) {
+            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                new Text(reprUrl));
+          }
           output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
           if (LOG.isDebugEnabled()) {
             LOG.debug(" - " + redirType + " redirect to " +

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=637858&r1=637857&r2=637858&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Mon Mar 17 05:33:56 2008
@@ -152,7 +152,7 @@
                                      refreshTime < Fetcher.PERM_REFRESH_TIME);
                 CrawlDatum newDatum = new CrawlDatum();
                 newDatum.setStatus(CrawlDatum.STATUS_LINKED);
-                if (!reprUrl.equals(newUrl)) {
+                if (reprUrl != null && !reprUrl.equals(newUrl)) {
                   newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
                                              new Text(reprUrl));
                 }