You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/06/10 21:30:35 UTC

svn commit: r413356 - in /lucene/nutch/trunk/src/plugin/subcollection: ./ src/java/org/apache/nutch/collection/ src/test/org/ src/test/org/apache/ src/test/org/apache/nutch/ src/test/org/apache/nutch/collection/

Author: siren
Date: Sat Jun 10 12:30:34 2006
New Revision: 413356

URL: http://svn.apache.org/viewvc?rev=413356&view=rev
Log:
fixed plugin.xml and a bug in Subcollection.java, added testcase to verify functionality

Added:
    lucene/nutch/trunk/src/plugin/subcollection/src/test/org/
    lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/
    lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
Modified:
    lucene/nutch/trunk/src/plugin/subcollection/plugin.xml
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java

Modified: lucene/nutch/trunk/src/plugin/subcollection/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=413356&r1=413355&r2=413356&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/subcollection/plugin.xml Sat Jun 10 12:30:34 2006
@@ -10,15 +10,19 @@
    </requires>
 
    <runtime>
-      <library name="subcollection.jar"/>
+      <library name="subcollection.jar">
+         <export name="*"/>
+      </library>
    </runtime>
-
+   
    <extension id="org.apache.nutch.searcher.subcollection.query"
               name="Subcollection Query Filter"
               point="org.apache.nutch.searcher.QueryFilter">
     <implementation id="SubcollectionQueryFilter"
-               class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter"
-               raw-fields="subcollection"/>
+               class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter">
+         <parameter name="raw-fields" value="subcollection"/>
+         </implementation>
+               
    </extension>      
 
    <extension id="org.apache.nutch.indexer.subcollection.indexing"

Modified: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=413356&r1=413355&r2=413356&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Sat Jun 10 12:30:34 2006
@@ -30,6 +30,7 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.nutch.util.DomUtil;
+import org.apache.nutch.util.NutchConfiguration;
 import org.apache.xerces.dom.DocumentImpl;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@@ -50,6 +51,13 @@
     super(conf);
     init();
   }
+  
+  /** 
+   * Used for testing
+   */
+  protected CollectionManager(){
+    super(NutchConfiguration.create());
+  }
 
   protected void init(){
     try {
@@ -60,26 +68,30 @@
 
       InputStream input = getConf().getConfResourceAsInputStream(
           getConf().get("subcollections.config", DEFAULT_FILE_NAME));
-      Element collections = DomUtil.getDom(input);
-
-      if (collections != null) {
-        NodeList nodeList = collections
-            .getElementsByTagName(Subcollection.TAG_COLLECTION);
-
-        LOG.info("file has" + nodeList.getLength() + " elements");
-        
-        for (int i = 0; i < nodeList.getLength(); i++) {
-          Element scElem = (Element) nodeList.item(i);
-          Subcollection subCol = new Subcollection(getConf());
-          subCol.initialize(scElem);
-          collectionMap.put(subCol.name, subCol);
-        }
-      } else {
-        LOG.info("Cannot find collections");
-      }
+      parse(input);
     } catch (Exception e) {
       LOG.info("Error occured:" + e);
       e.printStackTrace(System.out);
+    }
+  }
+
+  protected void parse(InputStream input) {
+    Element collections = DomUtil.getDom(input);
+
+    if (collections != null) {
+      NodeList nodeList = collections
+          .getElementsByTagName(Subcollection.TAG_COLLECTION);
+
+      LOG.info("file has" + nodeList.getLength() + " elements");
+      
+      for (int i = 0; i < nodeList.getLength(); i++) {
+        Element scElem = (Element) nodeList.item(i);
+        Subcollection subCol = new Subcollection(getConf());
+        subCol.initialize(scElem);
+        collectionMap.put(subCol.name, subCol);
+      }
+    } else {
+      LOG.info("Cannot find collections");
     }
   }
   

Modified: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=413356&r1=413355&r2=413356&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Sat Jun 10 12:30:34 2006
@@ -158,11 +158,13 @@
   }
 
   /**
-   * Initialize SubCollection from dom element
+   * Initialize Subcollection from dom element
    * 
    * @param collection
    */
   public void initialize(Element collection) {
+    this.id = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_ID).item(0)).trim();
     this.name = DOMUtil.getChildText(
         collection.getElementsByTagName(TAG_NAME).item(0)).trim();
     this.wlString = DOMUtil.getChildText(

Added: lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=413356&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Sat Jun 10 12:30:34 2006
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Collection;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestSubcollection extends TestCase {
+  
+  /**Test filtering logic
+   * 
+   * @throws Exception
+   */
+  public void testFilter() throws Exception {
+    Subcollection sc=new Subcollection(NutchConfiguration.create());
+    sc.setWhiteList("www.nutch.org\nwww.apache.org");
+    sc.setBlackList("jpg\nwww.apache.org/zecret/");
+    
+    //matches whitelist
+    assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html"));
+    
+    //matches blacklist
+    assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html"));
+    assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg"));
+    
+    //no match
+    assertEquals(null, sc.filter("http://www.google.com/"));
+  }
+  
+  public void testInput(){
+    StringBuffer xml=new StringBuffer();
+    xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
+    xml.append("<subcollections>");
+    xml.append("<subcollection>");
+    xml.append("<name>nutch collection</name>");
+    xml.append("<id>nutch</id>");
+    xml.append("<whitelist>");
+    xml.append("http://lucene.apache.org/nutch/\n");
+    xml.append("http://wiki.apache.org/nutch/\n");
+    xml.append("</whitelist>");
+    xml.append("<blacklist>");
+    xml.append("http://www.xxx.yyy\n");
+    xml.append("</blacklist>");
+    xml.append("</subcollection>");
+    xml.append("</subcollections>");
+    
+    InputStream is=new ByteArrayInputStream(xml.toString().getBytes());
+    
+    CollectionManager cm=new CollectionManager();
+    cm.parse(is);
+    
+    Collection c=cm.getAll();
+    
+    // test that size matches
+    assertEquals(1,c.size());
+    
+    Subcollection collection=(Subcollection)c.toArray()[0];
+    
+    //test collection id
+    assertEquals("nutch", collection.getId());
+    
+    //test collection name
+    assertEquals("nutch collection", collection.getName());
+
+    //test whitelist
+    assertEquals(2,collection.whiteList.size());
+    
+    String wlUrl=(String)collection.whiteList.get(0);
+    assertEquals("http://lucene.apache.org/nutch/", wlUrl);
+
+    wlUrl=(String)collection.whiteList.get(1);
+    assertEquals("http://wiki.apache.org/nutch/", wlUrl);
+    
+    //matches whitelist
+    assertEquals("http://lucene.apache.org/nutch/", collection.filter("http://lucene.apache.org/nutch/"));
+
+    //test blacklist
+    assertEquals(1,collection.blackList.size());
+
+    String blUrl=(String)collection.blackList.get(0);
+    assertEquals("http://www.xxx.yyy", blUrl);
+
+    //no match
+    assertEquals(null, collection.filter("http://www.google.com/"));
+  }
+}