You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/24 10:18:04 UTC

svn commit: r747312 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher2.java

Author: siren
Date: Tue Feb 24 09:18:03 2009
New Revision: 747312

URL: http://svn.apache.org/viewvc?rev=747312&view=rev
Log:
NUTCH-626 - Fetcher2 breaks out of the domain at cross-domain redirects even when db.ignore.external.links is set; contributed by Remco Verhoef, dogacan

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747312&r1=747311&r2=747312&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 09:18:03 2009
@@ -349,11 +349,14 @@
 130. NUTCH-563 - Include custom fields in BasicQueryFilter
      (Julien Nioche via siren)
      
-131. NUTCH-695 - incorrect mime type detection by MoreIndexingFilter plugin
+131. NUTCH-695 - Incorrect mime type detection by MoreIndexingFilter plugin
      (Dmitry Lihachev via siren)
      
 132. NUTCH-694 - Distributed Search Server fails (siren)
 
+133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links
+     set at cross domain redirects (Remco Verhoef, dogacan via siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j configuration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=747312&r1=747311&r2=747312&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Feb 24 09:18:03 2009
@@ -94,7 +94,6 @@
       throws IOException {
       FileStatus[] files = listStatus(job);
       FileSplit[] splits = new FileSplit[files.length];
-      FileSystem fs = FileSystem.get(job);
       for (int i = 0; i < files.length; i++) {
         FileStatus cur = files[i];
         splits[i] = new FileSplit(cur.getPath(), 0,
@@ -443,6 +442,7 @@
     private String reprUrl;
     private boolean redirecting;
     private int redirectCount;
+    private boolean ignoreExternalLinks;
 
     public FetcherThread(Configuration conf) {
       this.setDaemon(true);                       // don't hang JVM on exit
@@ -457,6 +457,8 @@
       // backward-compatible default setting
       this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
       this.maxRedirect = conf.getInt("http.redirect.max", 3);
+      this.ignoreExternalLinks = 
+        conf.getBoolean("db.ignore.external.links", false);
     }
 
     public void run() {
@@ -673,6 +675,22 @@
     throws MalformedURLException, URLFilterException {
       newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
       newUrl = urlFilters.filter(newUrl);
+      
+      if (ignoreExternalLinks) {
+        try {
+          String origHost = new URL(urlString).getHost().toLowerCase();
+          String newHost = new URL(newUrl).getHost().toLowerCase();
+          if (!origHost.equals(newHost)) {
+            if (LOG.isDebugEnabled()) {
+              LOG.debug(" - ignoring redirect " + redirType + " from " +
+                          urlString + " to " + newUrl +
+                          " because external links are ignored");
+            }
+            return null;
+          }
+        } catch (MalformedURLException e) { }
+      }
+      
       if (newUrl != null && !newUrl.equals(urlString)) {
         reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
         url = new Text(newUrl);