You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/01/05 20:45:34 UTC

svn commit: r1055604 - in /nutch/branches/branch-1.3: CHANGES.txt src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java

Author: jnioche
Date: Wed Jan  5 19:45:33 2011
New Revision: 1055604

URL: http://svn.apache.org/viewvc?rev=1055604&view=rev
Log:
NUTCH-950 DomainURLFilter throws NPE on bogus urls

Modified:
    nutch/branches/branch-1.3/CHANGES.txt
    nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java

Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1055604&r1=1055603&r2=1055604&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Wed Jan  5 19:45:33 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.3 - Current Development
 
+* NUTCH-950 DomainURLFilter throws NPE on bogus urls (Alexis Detreglode via jnioche)
+
 * NUTCH-935 basicurlnormalizer removes unnecessary /./ in URLs
 
 * NUTCH-912 MoreIndexingFilter does not parse docx and xlsx date formats (Markus Jelsma, jnioche)

Modified: nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1055604&r1=1055603&r2=1055604&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Wed Jan  5 19:45:33 2011
@@ -31,6 +31,7 @@ import org.apache.nutch.net.URLFilter;
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.PluginRepository;
 import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
 
 /**
  * <p>Filters URLs based on a file containing domain suffixes, domain names, and
@@ -170,9 +171,14 @@ public class DomainURLFilter
 
       // match for suffix, domain, and host in that order.  more general will
       // override more specific
-      String suffix = URLUtil.getDomainSuffix(url).getDomain();
       String domain = URLUtil.getDomainName(url).toLowerCase().trim();
       String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+      
       if (domainSet.contains(suffix) || domainSet.contains(domain)
         || domainSet.contains(host)) {
         return url;