You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2011/04/12 14:51:16 UTC

svn commit: r1091390 - in /nutch/branches/branch-1.3: CHANGES.txt src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java

Author: markus
Date: Tue Apr 12 12:51:16 2011
New Revision: 1091390

URL: http://svn.apache.org/viewvc?rev=1091390&view=rev
Log:
NUTCH-891 Subcollection plugin won't require blacklist any more (markus)

Modified:
    nutch/branches/branch-1.3/CHANGES.txt
    nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java

Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1091390&r1=1091389&r2=1091390&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Tue Apr 12 12:51:16 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.3 - Current Development
 
+* NUTCH-891 Subcollection plugin won't require blacklist any more (markus)
+
 * NUTCH-972 CrawlDbMerger doesn't break on non-existent input (Gabriele Kahlout via jnioche)
 
 * NUTCH-967 Upgrade to Tika 0.9 (jnioche)

Modified: nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=1091390&r1=1091389&r2=1091390&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original)
+++ nutch/branches/branch-1.3/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Tue Apr 12 12:51:16 2011
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configured
 import org.apache.nutch.net.URLFilter;
 import org.apache.xerces.util.DOMUtil;
 import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
 
 /**
  * SubCollection represents a subset of index, you can define url patterns that
@@ -170,11 +171,15 @@ public class Subcollection extends Confi
         collection.getElementsByTagName(TAG_NAME).item(0)).trim();
     this.wlString = DOMUtil.getChildText(
         collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim();
-    this.blString = DOMUtil.getChildText(
-        collection.getElementsByTagName(TAG_BLACKLIST).item(0)).trim();
 
     parseList(this.whiteList, wlString);
-    parseList(this.blackList, blString);
+
+    // Check if there's a blacklist we need to parse
+    NodeList nodeList = collection.getElementsByTagName(TAG_BLACKLIST);
+    if (nodeList.getLength() > 0) {
+      this.blString = DOMUtil.getChildText(nodeList.item(0)).trim();
+      parseList(this.blackList, blString);
+    }
   }
 
   /**