You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/04/30 08:39:34 UTC
[nutch] branch master updated: NUTCH-2776 Fetcher to temporarily
deduplicate followed redirects - cache followed redirect targets for a
configurable time (`fetcher.redirect.dedupcache.seconds`) - if a redirect
target is found in cache it's skipped
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 0f33d18 NUTCH-2776 Fetcher to temporarily deduplicate followed redirects - cache followed redirect targets for a configurable time (`fetcher.redirect.dedupcache.seconds`) - if a redirect target is found in cache it's skipped
new a8162b9 Merge pull request #505 from sebastian-nagel/NUTCH-2776-fetcher-dedup-redirects
0f33d18 is described below
commit 0f33d183c80e3f75f39d8ebe0dff163436b6d710
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Fri Mar 20 19:55:37 2020 +0100
NUTCH-2776 Fetcher to temporarily deduplicate followed redirects
- cache followed redirect targets for a configurable time
(`fetcher.redirect.dedupcache.seconds`)
- if a redirect target is found in cache it's skipped
---
conf/nutch-default.xml | 24 ++++++++++++++
.../org/apache/nutch/fetcher/FetchItemQueues.java | 37 ++++++++++++++++++++--
.../org/apache/nutch/fetcher/FetcherThread.java | 9 ++++++
3 files changed, 68 insertions(+), 2 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 85d9933..b7e7ee9 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1209,6 +1209,30 @@
</description>
</property>
+<property>
+ <name>fetcher.redirect.dedupcache.seconds</name>
+ <value>-1</value>
+ <description>
+ The maximum time in seconds the fetcher will cache redirects for
+ deduplication. If the same redirect URL is seen again within
+ this time, it is skipped. This helps to avoid pathological cases
+ where many or most of the URLs of a host are redirected to the
+ same URL, e.g. a page to log in, accept cookies, or indicate an
+ error. A value less than or equal to zero disables redirect deduplication.
+ Caveat: This may break setting cookies via recursive redirect chains.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.redirect.dedupcache.size</name>
+ <value>1000</value>
+ <description>
+ The maximum size of the cache to deduplicate redirects,
+ see `fetcher.redirect.dedupcache.seconds`.
+ </description>
+</property>
+
+
<!-- SegmentReader -->
<property>
<name>segment.reader.content.recode</name>
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index 3c1003e..ce7b2b6 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -22,6 +22,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
@@ -30,9 +31,13 @@ import org.apache.nutch.crawl.CrawlDatum;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.base.Optional;
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+
/**
- * Convenience class - a collection of queues that keeps track of the total
- * number of items, and provides items eligible for fetching from any queue.
+ * A collection of queues that keeps track of the total number of items, and
+ * provides items eligible for fetching from any queue.
*/
public class FetchItemQueues {
@@ -44,6 +49,8 @@ public class FetchItemQueues {
private Set<String> queuesMaxExceptions = new HashSet<>();
Iterator<Map.Entry<String, FetchItemQueue>> lastIterator = null;
AtomicInteger totalSize = new AtomicInteger(0);
+ Cache<Text, Optional<String>> redirectDedupCache = null;
+
int maxThreads;
long crawlDelay;
long minCrawlDelay;
@@ -77,6 +84,16 @@ public class FetchItemQueues {
this.timelimit = conf.getLong("fetcher.timelimit", -1);
this.maxExceptionsPerQueue = conf.getInt(
"fetcher.max.exceptions.per.queue", -1);
+
+ int dedupRedirMaxTime = conf.getInt("fetcher.redirect.dedupcache.seconds",
+ -1);
+ int dedupRedirMaxSize = conf.getInt("fetcher.redirect.dedupcache.size",
+ 1000);
+ if (dedupRedirMaxTime > 0 && dedupRedirMaxSize > 0) {
+ redirectDedupCache = CacheBuilder.newBuilder()
+ .maximumSize(dedupRedirMaxSize)
+ .expireAfterWrite(dedupRedirMaxTime, TimeUnit.SECONDS).build();
+ }
}
/**
@@ -246,6 +263,22 @@ public class FetchItemQueues {
return 0;
}
+ /**
+  * Checks whether a redirect target has been seen recently and, if not,
+  * records it in the redirect deduplication cache.
+  * <p>
+  * Lookup and insertion are performed as a single atomic
+  * {@code putIfAbsent} on the cache's map view. A separate
+  * {@code getIfPresent} followed by {@code put} would allow several fetcher
+  * threads following redirects to the same target at the same time to all
+  * miss the cache and all return {@code false}.
+  *
+  * @param redirUrl
+  *          redirect target
+  * @return true if redirects are deduplicated and redirUrl has been queued
+  *         recently; false if the deduplication cache is not configured or
+  *         the target was not cached yet (in which case this call adds it)
+  */
+ public boolean redirectIsQueuedRecently(Text redirUrl) {
+   if (redirectDedupCache == null) {
+     // no cache was created in the constructor: deduplication is off
+     return false;
+   }
+   // a non-null previous mapping means the target is already in the cache
+   return redirectDedupCache.asMap().putIfAbsent(redirUrl,
+       Optional.absent()) != null;
+ }
+
public synchronized void dump() {
for (String id : queues.keySet()) {
FetchItemQueue fiq = queues.get(id);
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 5d5a20b..1a23300 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -450,6 +450,8 @@ public class FetcherThread extends Thread {
if (redirecting && redirectCount > maxRedirect) {
fetchQueues.finishFetchItem(fit);
+ context.getCounter("FetcherStatus", "redirect_count_exceeded")
+ .increment(1);
if (LOG.isInfoEnabled()) {
LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
Thread.currentThread().getId(), fit.url,
@@ -585,6 +587,13 @@ public class FetcherThread extends Thread {
private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
throws ScoringFilterException {
+ if (fetchQueues.redirectIsQueuedRecently(redirUrl)) {
+ redirecting = false;
+ context.getCounter("FetcherStatus", "redirect_deduplicated").increment(1);
+ LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url,
+ redirUrl);
+ return null;
+ }
CrawlDatum newDatum = createRedirDatum(redirUrl, fit, CrawlDatum.STATUS_DB_UNFETCHED);
fit = FetchItem.create(redirUrl, newDatum, queueMode);
if (fit != null) {