You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/12/24 13:45:27 UTC

svn commit: r1721615 - in /nutch/trunk: CHANGES.txt src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java

Author: markus
Date: Thu Dec 24 12:45:27 2015
New Revision: 1721615

URL: http://svn.apache.org/viewvc?rev=1721615&view=rev
Log:
NUTCH-2189 Domain filter must deactivate if no rules are present

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
    nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1721615&r1=1721614&r2=1721615&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Dec 24 12:45:27 2015
@@ -1,6 +1,8 @@
 Nutch Change Log
 
-* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency
+* NUTCH-2189 Domain filter must deactivate if no rules are present (markus)
+
+* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce)
 
 * NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory (lewismc)
 

Modified: nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1721615&r1=1721614&r2=1721615&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Thu Dec 24 12:45:27 2015
@@ -180,9 +180,10 @@ public class DomainURLFilter implements
   }
 
   public String filter(String url) {
-
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    if (domainSet.size() == 0) return url;
+    
     try {
-
       // match for suffix, domain, and host in that order. more general will
       // override more specific
       String domain = URLUtil.getDomainName(url).toLowerCase().trim();

Modified: nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1721615&r1=1721614&r2=1721615&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Thu Dec 24 12:45:27 2015
@@ -44,5 +44,24 @@ public class TestDomainURLFilter {
     Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
     Assert.assertNull(domainFilter.filter("http://www.adobe.com"));
   }
+  
+  @Test
+  public void testNoFilter() throws Exception {
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    domainFilter.setConf(conf);
+    Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.google.com"));
+    Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+    Assert.assertNotNull(domainFilter.filter("http://www.adobe.com"));
+  }
 
 }