You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/08 09:03:38 UTC
svn commit: r1347897 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/fetcher/Fetcher.java
Author: markus
Date: Fri Jun 8 07:03:38 2012
New Revision: 1347897
URL: http://svn.apache.org/viewvc?rev=1347897&view=rev
Log:
NUTCH-1346 Follow outlinks to ignore external
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1347897&r1=1347896&r2=1347897&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun 8 07:03:38 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1346 Follow outlinks to ignore external (markus)
+
* NUTCH-1320 IndexChecker and ParseChecker choke on IDN's (markus)
* NUTCH-1351 DomainStatistics to aggregate by TLD (markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1347897&r1=1347896&r2=1347897&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jun 8 07:03:38 2012
@@ -801,6 +801,14 @@
</description>
</property>
+<property>
+ <name>fetcher.follow.outlinks.ignore.external</name>
+ <value>true</value>
+ <description>Whether the fetcher ignores external outlinks (outlinks to a different host) when following outlinks. Set db.ignore.external.links to false and this to true to store external outlinks
+ in the output but not follow them. If db.ignore.external.links is true, this directive is ignored.
+ </description>
+</property>
+
<!-- moreindexingfilter plugin properties -->
<property>
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1347897&r1=1347896&r2=1347897&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Jun 8 07:03:38 2012
@@ -588,6 +588,8 @@ public class Fetcher extends Configured
private final int interval;
private int maxOutlinkDepth;
private int maxOutlinkDepthNumLinks;
+ private boolean outlinksIgnoreExternal;
+
private int outlinksDepthDivisor;
private boolean skipTruncated;
@@ -619,6 +621,7 @@ public class Fetcher extends Configured
interval = conf.getInt("db.fetch.interval.default", 2592000);
ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
+ outlinksIgnoreExternal = conf.getBoolean("fetcher.follow.outlinks.ignore.external", false);
maxOutlinkDepthNumLinks = conf.getInt("fetcher.follow.outlinks.num.links", 4);
outlinksDepthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor", 2);
}
@@ -1051,13 +1054,24 @@ public class Fetcher extends Configured
// Calculate variable number of outlinks by depth using the divisor (outlinks = Math.floor(divisor / depth * num.links))
int maxOutlinksByDepth = (int)Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
+ String followUrl;
+
// Walk over the outlinks and add as new FetchItem to the queues
Iterator<String> iter = outlinks.iterator();
while(iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
+ followUrl = iter.next();
+
+ // Check whether we'll follow external outlinks
+ if (outlinksIgnoreExternal) {
+ if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
+ continue;
+ }
+ }
+
reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);
// Create new FetchItem with depth incremented
- FetchItem fit = FetchItem.create(new Text(iter.next()), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
+ FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
fetchQueues.addFetchItem(fit);
outlinkCounter++;