You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/04/05 19:06:04 UTC
svn commit: r1585144 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/fetcher/Fetcher.java
Author: snagel
Date: Sat Apr 5 17:06:04 2014
New Revision: 1585144
URL: http://svn.apache.org/r1585144
Log:
NUTCH-1735 code dedup fetcher queue redirects
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1585144&r1=1585143&r2=1585144&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sat Apr 5 17:06:04 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1735 code dedup fetcher queue redirects (snagel)
+
* NUTCH-1745 Upgrade to ElasticSearch 1.1.0 (jnioche)
* NUTCH-1645 Junit Test Case for Adaptive Fetch Schedule class (Yasin Kılınç, lufeng, Sertac TURKEL via snagel)
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1585144&r1=1585143&r2=1585144&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sat Apr 5 17:06:04 2014
@@ -731,25 +731,7 @@ public class Fetcher extends Configured
refreshTime < Fetcher.PERM_REFRESH_TIME,
Fetcher.CONTENT_REDIR);
if (redirUrl != null) {
- CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
- fit.datum.getFetchInterval(), fit.datum.getScore());
- // transfer existing metadata to the redir
- newDatum.getMetaData().putAll(fit.datum.getMetaData());
- scfilters.initialScore(redirUrl, newDatum);
- if (reprUrl != null) {
- newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
- new Text(reprUrl));
- }
- fit = FetchItem.create(redirUrl, newDatum, queueMode);
- if (fit != null) {
- FetchItemQueue fiq =
- fetchQueues.getFetchItemQueue(fit.queueID);
- fiq.addInProgressFetchItem(fit);
- } else {
- // stop redirecting
- redirecting = false;
- reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
- }
+ queueRedirect(redirUrl, fit);
}
}
break;
@@ -772,25 +754,7 @@ public class Fetcher extends Configured
urlString, newUrl, temp,
Fetcher.PROTOCOL_REDIR);
if (redirUrl != null) {
- CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
- fit.datum.getFetchInterval(), fit.datum.getScore());
- // transfer existing metadata
- newDatum.getMetaData().putAll(fit.datum.getMetaData());
- scfilters.initialScore(redirUrl, newDatum);
- if (reprUrl != null) {
- newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
- new Text(reprUrl));
- }
- fit = FetchItem.create(redirUrl, newDatum, queueMode);
- if (fit != null) {
- FetchItemQueue fiq =
- fetchQueues.getFetchItemQueue(fit.queueID);
- fiq.addInProgressFetchItem(fit);
- } else {
- // stop redirecting
- redirecting = false;
- reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
- }
+ queueRedirect(redirUrl, fit);
} else {
// stop redirecting
redirecting = false;
@@ -918,6 +882,28 @@ public class Fetcher extends Configured
}
}
+ private void queueRedirect(Text redirUrl, FetchItem fit) throws ScoringFilterException { // creates a CrawlDatum for redirUrl from fit.datum and tries to queue a FetchItem for it
+ CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
+ fit.datum.getFetchInterval(), fit.datum.getScore()); // fetch interval and score are copied from the datum of the redirecting item
+ // transfer all existing metadata to the redirect
+ newDatum.getMetaData().putAll(fit.datum.getMetaData());
+ scfilters.initialScore(redirUrl, newDatum); // presumably lets the scoring filters set the initial score of the new datum - confirm against ScoringFilters
+ if (reprUrl != null) { // reprUrl is not a parameter; it is read from the enclosing scope
+ newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+ new Text(reprUrl));
+ }
+ fit = FetchItem.create(redirUrl, newDatum, queueMode); // NOTE(review): reassigns the parameter only; the inline code this method replaced reassigned the caller's fit, which now stays unchanged - confirm the caller does not rely on it
+ if (fit != null) {
+ FetchItemQueue fiq =
+ fetchQueues.getFetchItemQueue(fit.queueID); // queue is looked up by the queueID of the newly created item
+ fiq.addInProgressFetchItem(fit);
+ } else {
+ // stop redirecting
+ redirecting = false; // redirecting is not local to this method; cleared because no FetchItem could be created for redirUrl
+ reporter.incrCounter("FetcherStatus", "FetchItem.notCreated.redirect", 1);
+ }
+ }
+
private void logError(Text url, String message) {
if (LOG.isInfoEnabled()) {
LOG.info("fetch of " + url + " failed with: " + message);