You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/24 10:18:04 UTC
svn commit: r747312 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/fetcher/Fetcher2.java
Author: siren
Date: Tue Feb 24 09:18:03 2009
New Revision: 747312
URL: http://svn.apache.org/viewvc?rev=747312&view=rev
Log:
NUTCH-626 - Fetcher2 breaks out of the domain on cross-domain redirects when db.ignore.external.links is set; contributed by Remco Verhoef and dogacan
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747312&r1=747311&r2=747312&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 09:18:03 2009
@@ -349,11 +349,14 @@
130. NUTCH-563 - Include custom fields in BasicQueryFilter
(Julien Nioche via siren)
-131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin
+131. NUTCH-695 - Incorrect mime type detection by MoreIndexingFilter plugin
(Dmitry Lihachev via siren)
132. NUTCH-694 - Distributed Search Server fails (siren)
+133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links
+ set at cross domain redirects (Remco Verhoef, dogacan via siren)
+
Release 0.9 - 2007-04-02
1. Changed log4j configuration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=747312&r1=747311&r2=747312&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Feb 24 09:18:03 2009
@@ -94,7 +94,6 @@
throws IOException {
FileStatus[] files = listStatus(job);
FileSplit[] splits = new FileSplit[files.length];
- FileSystem fs = FileSystem.get(job);
for (int i = 0; i < files.length; i++) {
FileStatus cur = files[i];
splits[i] = new FileSplit(cur.getPath(), 0,
@@ -443,6 +442,7 @@
private String reprUrl;
private boolean redirecting;
private int redirectCount;
+ private boolean ignoreExternalLinks;
public FetcherThread(Configuration conf) {
this.setDaemon(true); // don't hang JVM on exit
@@ -457,6 +457,8 @@
// backward-compatible default setting
this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
this.maxRedirect = conf.getInt("http.redirect.max", 3);
+ this.ignoreExternalLinks =
+ conf.getBoolean("db.ignore.external.links", false);
}
public void run() {
@@ -673,6 +675,22 @@
throws MalformedURLException, URLFilterException {
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = urlFilters.filter(newUrl);
+
+ if (ignoreExternalLinks) {
+ try {
+ String origHost = new URL(urlString).getHost().toLowerCase();
+ String newHost = new URL(newUrl).getHost().toLowerCase();
+ if (!origHost.equals(newHost)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(" - ignoring redirect " + redirType + " from " +
+ urlString + " to " + newUrl +
+ " because external links are ignored");
+ }
+ return null;
+ }
+ } catch (MalformedURLException e) { }
+ }
+
if (newUrl != null && !newUrl.equals(urlString)) {
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
url = new Text(newUrl);