You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/08 09:03:38 UTC

svn commit: r1347897 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/fetcher/Fetcher.java

Author: markus
Date: Fri Jun  8 07:03:38 2012
New Revision: 1347897

URL: http://svn.apache.org/viewvc?rev=1347897&view=rev
Log:
NUTCH-1346 Follow outlinks to ignore external

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1347897&r1=1347896&r2=1347897&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun  8 07:03:38 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1346 Follow outlinks to ignore external (markus)
+
 * NUTCH-1320 IndexChecker and ParseChecker choke on IDN's (markus)
 
 * NUTCH-1351 DomainStatistics to aggregate by TLD (markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1347897&r1=1347896&r2=1347897&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jun  8 07:03:38 2012
@@ -801,6 +801,14 @@
   </description>
 </property>
 
+<property>
+  <name>fetcher.follow.outlinks.ignore.external</name>
+  <value>true</value>  
+  <description>Whether the fetcher ignores external outlinks (links to a different host) when following outlinks. Set db.ignore.external.links to false and this to true to store
+  external outlinks in the output without following them. If db.ignore.external.links is true, this directive is ignored.
+  </description>
+</property>
+
 <!-- moreindexingfilter plugin properties -->
 
 <property>

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1347897&r1=1347896&r2=1347897&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Jun  8 07:03:38 2012
@@ -588,6 +588,8 @@ public class Fetcher extends Configured 
     private final int interval;
     private int maxOutlinkDepth;
     private int maxOutlinkDepthNumLinks;
+    private boolean outlinksIgnoreExternal;
+
     private int outlinksDepthDivisor;
     private boolean skipTruncated;
 
@@ -619,6 +621,7 @@ public class Fetcher extends Configured 
       interval = conf.getInt("db.fetch.interval.default", 2592000);
       ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
       maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
+      outlinksIgnoreExternal = conf.getBoolean("fetcher.follow.outlinks.ignore.external", false);
       maxOutlinkDepthNumLinks = conf.getInt("fetcher.follow.outlinks.num.links", 4);
       outlinksDepthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor", 2);
     }
@@ -1051,13 +1054,24 @@ public class Fetcher extends Configured 
               // Calculate variable number of outlinks by depth using the divisor (outlinks = Math.floor(divisor / depth * num.links))
               int maxOutlinksByDepth = (int)Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
 
+              String followUrl;
+
               // Walk over the outlinks and add as new FetchItem to the queues
               Iterator<String> iter = outlinks.iterator();
               while(iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
+                followUrl = iter.next();
+
+                // Check whether we'll follow external outlinks
+                if (outlinksIgnoreExternal) {
+                  if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
+                    continue;
+                  }
+                }
+
                 reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);
 
                 // Create new FetchItem with depth incremented
-                FetchItem fit = FetchItem.create(new Text(iter.next()), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
+                FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                 fetchQueues.addFetchItem(fit);
 
                 outlinkCounter++;