You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/02/12 09:42:49 UTC
svn commit: r1659169 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/LinkDb.java
Author: markus
Date: Thu Feb 12 08:42:49 2015
New Revision: 1659169
URL: http://svn.apache.org/r1659169
Log:
NUTCH-1913 LinkDB to implement db.ignore.external.links
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659169&r1=1659168&r2=1659169&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 12 08:42:49 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1913 LinkDB to implement db.ignore.external.links (markus, snagel)
+
* NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus)
* NUTCH-1323 AjaxNormalizer (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1659169&r1=1659168&r2=1659169&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Feb 12 08:42:49 2015
@@ -49,12 +49,14 @@ public class LinkDb extends Configured i
public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
public static final String IGNORE_INTERNAL_LINKS = "db.ignore.internal.links";
+ public static final String IGNORE_EXTERNAL_LINKS = "db.ignore.external.links";
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
private int maxAnchorLength;
private boolean ignoreInternalLinks;
+ private boolean ignoreExternalLinks;
private URLFilters urlFilters;
private URLNormalizers urlNormalizers;
@@ -68,6 +70,8 @@ public class LinkDb extends Configured i
public void configure(JobConf job) {
maxAnchorLength = job.getInt("db.max.anchor.length", 100);
ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
+ ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false);
+
if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
urlFilters = new URLFilters(job);
}
@@ -115,6 +119,11 @@ public class LinkDb extends Configured i
if (toHost == null || toHost.equals(fromHost)) { // internal link
continue; // skip it
}
+ } else if (ignoreExternalLinks) {
+ String toHost = getHost(toUrl);
+ if (toHost == null || !toHost.equals(fromHost)) { // external link
+ continue; // skip it
+ }
}
if (urlNormalizers != null) {
try {
@@ -180,6 +189,15 @@ public class LinkDb extends Configured i
if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
LOG.info("LinkDb: internal links will be ignored.");
}
+ if (job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) {
+ LOG.info("LinkDb: external links will be ignored.");
+ }
+ }
+ if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)
+ && job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) {
+ LOG.warn("LinkDb: internal and external links are ignored! "
+ + "Nothing to do, actually. Exiting.");
+ return;
}
for (int i = 0; i < segments.length; i++) {
@@ -291,7 +309,6 @@ public class LinkDb extends Configured i
System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
return -1;
}
- Path segDir = null;
final FileSystem fs = FileSystem.get(getConf());
Path db = new Path(args[0]);
ArrayList<Path> segs = new ArrayList<Path>();