You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/01/05 20:45:34 UTC
svn commit: r1055604 - in /nutch/branches/branch-1.3: CHANGES.txt
src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
Author: jnioche
Date: Wed Jan 5 19:45:33 2011
New Revision: 1055604
URL: http://svn.apache.org/viewvc?rev=1055604&view=rev
Log:
NUTCH-950 DomainURLFilter throws NPE on bogus urls
Modified:
nutch/branches/branch-1.3/CHANGES.txt
nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1055604&r1=1055603&r2=1055604&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Wed Jan 5 19:45:33 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.3 - Current Development
+* NUTCH-950 DomainURLFilter throws NPE on bogus urls (Alexis Detreglode via jnioche)
+
* NUTCH-935 basicurlnormalizer removes unnecessary /./ in URLs
* NUTCH-912 MoreIndexingFilter does not parse docx and xlsx date formats (Markus Jelsma, jnioche)
Modified: nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1055604&r1=1055603&r2=1055604&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Wed Jan 5 19:45:33 2011
@@ -31,6 +31,7 @@ import org.apache.nutch.net.URLFilter;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
/**
* <p>Filters URLs based on a file containing domain suffixes, domain names, and
@@ -170,9 +171,14 @@ public class DomainURLFilter
// match for suffix, domain, and host in that order. more general will
// override more specific
- String suffix = URLUtil.getDomainSuffix(url).getDomain();
String domain = URLUtil.getDomainName(url).toLowerCase().trim();
String host = URLUtil.getHost(url);
+ String suffix = null;
+ DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+ if (domainSuffix != null) {
+ suffix = domainSuffix.getDomain();
+ }
+
if (domainSet.contains(suffix) || domainSet.contains(domain)
|| domainSet.contains(host)) {
return url;