You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/02/12 09:42:49 UTC

svn commit: r1659169 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/LinkDb.java

Author: markus
Date: Thu Feb 12 08:42:49 2015
New Revision: 1659169

URL: http://svn.apache.org/r1659169
Log:
NUTCH-1913 LinkDB to implement db.ignore.external.links

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659169&r1=1659168&r2=1659169&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 12 08:42:49 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1913 LinkDB to implement db.ignore.external.links (markus, snagel)
+
 * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus)
 
 * NUTCH-1323 AjaxNormalizer (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1659169&r1=1659168&r2=1659169&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Feb 12 08:42:49 2015
@@ -49,12 +49,14 @@ public class LinkDb extends Configured i
   public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
 
   public static final String IGNORE_INTERNAL_LINKS = "db.ignore.internal.links";
+  public static final String IGNORE_EXTERNAL_LINKS = "db.ignore.external.links";
 
   public static final String CURRENT_NAME = "current";
   public static final String LOCK_NAME = ".locked";
 
   private int maxAnchorLength;
   private boolean ignoreInternalLinks;
+  private boolean ignoreExternalLinks;
   private URLFilters urlFilters;
   private URLNormalizers urlNormalizers;
 
@@ -68,6 +70,8 @@ public class LinkDb extends Configured i
   public void configure(JobConf job) {
     maxAnchorLength = job.getInt("db.max.anchor.length", 100);
     ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
+    ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false);
+
     if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
       urlFilters = new URLFilters(job);
     }
@@ -115,6 +119,11 @@ public class LinkDb extends Configured i
         if (toHost == null || toHost.equals(fromHost)) { // internal link
           continue; // skip it
         }
+      } else if (ignoreExternalLinks) {
+        String toHost = getHost(toUrl);
+        if (toHost == null || !toHost.equals(fromHost)) { // external link
+          continue;                               // skip it
+        }
       }
       if (urlNormalizers != null) {
         try {
@@ -180,6 +189,15 @@ public class LinkDb extends Configured i
       if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
         LOG.info("LinkDb: internal links will be ignored.");
       }
+      if (job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) {
+        LOG.info("LinkDb: external links will be ignored.");
+      }
+    }
+    if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)
+        && job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) {
+      LOG.warn("LinkDb: internal and external links are ignored! "
+          + "Nothing to do, actually. Exiting.");
+      return;
     }
 
     for (int i = 0; i < segments.length; i++) {
@@ -291,7 +309,6 @@ public class LinkDb extends Configured i
       System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
       return -1;
     }
-    Path segDir = null;
     final FileSystem fs = FileSystem.get(getConf());
     Path db = new Path(args[0]);
     ArrayList<Path> segs = new ArrayList<Path>();