You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/04/05 19:06:04 UTC

svn commit: r1585144 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java

Author: snagel
Date: Sat Apr  5 17:06:04 2014
New Revision: 1585144

URL: http://svn.apache.org/r1585144
Log:
NUTCH-1735 code dedup fetcher queue redirects

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1585144&r1=1585143&r2=1585144&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr  5 17:06:04 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1735 code dedup fetcher queue redirects (snagel)
+
 * NUTCH-1745 Upgrade to ElasticSearch 1.1.0 (jnioche)
 
 * NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin Kılınç, lufeng, Sertac TURKEL via snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1585144&r1=1585143&r2=1585144&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Apr  5 17:06:04 2014
@@ -731,25 +731,7 @@ public class Fetcher extends Configured 
                                    refreshTime < Fetcher.PERM_REFRESH_TIME,
                                    Fetcher.CONTENT_REDIR);
                   if (redirUrl != null) {
-                    CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
-                        fit.datum.getFetchInterval(), fit.datum.getScore());
-                    // transfer existing metadata to the redir
-                    newDatum.getMetaData().putAll(fit.datum.getMetaData());
-                    scfilters.initialScore(redirUrl, newDatum);
-                    if (reprUrl != null) {
-                      newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-                          new Text(reprUrl));
-                    }
-                    fit = FetchItem.create(redirUrl, newDatum, queueMode);
-                    if (fit != null) {
-                      FetchItemQueue fiq =
-                        fetchQueues.getFetchItemQueue(fit.queueID);
-                      fiq.addInProgressFetchItem(fit);
-                    } else {
-                      // stop redirecting
-                      redirecting = false;
-                      reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
-                    }
+                    queueRedirect(redirUrl, fit);
                   }
                 }
                 break;
@@ -772,25 +754,7 @@ public class Fetcher extends Configured 
                                  urlString, newUrl, temp,
                                  Fetcher.PROTOCOL_REDIR);
                 if (redirUrl != null) {
-                  CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
-                      fit.datum.getFetchInterval(), fit.datum.getScore());
-                  // transfer existing metadata
-                  newDatum.getMetaData().putAll(fit.datum.getMetaData());
-                  scfilters.initialScore(redirUrl, newDatum);
-                  if (reprUrl != null) {
-                    newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-                        new Text(reprUrl));
-                  }
-                  fit = FetchItem.create(redirUrl, newDatum, queueMode);
-                  if (fit != null) {
-                    FetchItemQueue fiq =
-                      fetchQueues.getFetchItemQueue(fit.queueID);
-                    fiq.addInProgressFetchItem(fit);
-                  } else {
-                    // stop redirecting
-                    redirecting = false;
-                    reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
-                  }
+                  queueRedirect(redirUrl, fit);
                 } else {
                   // stop redirecting
                   redirecting = false;
@@ -918,6 +882,28 @@ public class Fetcher extends Configured 
       }
     }
 
+    private void queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException {
+      CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
+          fit.datum.getFetchInterval(), fit.datum.getScore());
+      // the redirect target's datum inherits all metadata of the redirecting item
+      newDatum.getMetaData().putAll(fit.datum.getMetaData());
+      scfilters.initialScore(redirUrl, newDatum);
+      if (reprUrl != null) {
+        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+            new Text(reprUrl));
+      }
+      fit = FetchItem.create(redirUrl, newDatum, queueMode); // NOTE(review): rebinds only this method's parameter, whereas the inlined code this method replaces assigned the caller's own fit - confirm callers do not need the new item
+      if (fit != null) {
+        FetchItemQueue fiq =
+          fetchQueues.getFetchItemQueue(fit.queueID);
+        fiq.addInProgressFetchItem(fit);
+      } else {
+        // FetchItem.create returned null: stop redirecting and count the failure
+        redirecting = false;
+        reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
+      }
+    }
+
     private void logError(Text url, String message) {
       if (LOG.isInfoEnabled()) {
         LOG.info("fetch of " + url + " failed with: " + message);