You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/06/10 21:30:35 UTC
svn commit: r413356 - in /lucene/nutch/trunk/src/plugin/subcollection: ./
src/java/org/apache/nutch/collection/ src/test/org/ src/test/org/apache/
src/test/org/apache/nutch/ src/test/org/apache/nutch/collection/
Author: siren
Date: Sat Jun 10 12:30:34 2006
New Revision: 413356
URL: http://svn.apache.org/viewvc?rev=413356&view=rev
Log:
Fixed plugin.xml and a bug in Subcollection.java; added a test case to verify functionality.
Added:
lucene/nutch/trunk/src/plugin/subcollection/src/test/org/
lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/
lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/
lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
Modified:
lucene/nutch/trunk/src/plugin/subcollection/plugin.xml
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
Modified: lucene/nutch/trunk/src/plugin/subcollection/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=413356&r1=413355&r2=413356&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/subcollection/plugin.xml Sat Jun 10 12:30:34 2006
@@ -10,15 +10,19 @@
</requires>
<runtime>
- <library name="subcollection.jar"/>
+ <library name="subcollection.jar">
+ <export name="*"/>
+ </library>
</runtime>
-
+
<extension id="org.apache.nutch.searcher.subcollection.query"
name="Subcollection Query Filter"
point="org.apache.nutch.searcher.QueryFilter">
<implementation id="SubcollectionQueryFilter"
- class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter"
- raw-fields="subcollection"/>
+ class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter">
+ <parameter name="raw-fields" value="subcollection"/>
+ </implementation>
+
</extension>
<extension id="org.apache.nutch.indexer.subcollection.indexing"
Modified: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=413356&r1=413355&r2=413356&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Sat Jun 10 12:30:34 2006
@@ -30,6 +30,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.util.DomUtil;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.xerces.dom.DocumentImpl;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -50,6 +51,13 @@
super(conf);
init();
}
+
+ /**
+ * Used for testing
+ */
+ protected CollectionManager(){
+ super(NutchConfiguration.create());
+ }
protected void init(){
try {
@@ -60,26 +68,30 @@
InputStream input = getConf().getConfResourceAsInputStream(
getConf().get("subcollections.config", DEFAULT_FILE_NAME));
- Element collections = DomUtil.getDom(input);
-
- if (collections != null) {
- NodeList nodeList = collections
- .getElementsByTagName(Subcollection.TAG_COLLECTION);
-
- LOG.info("file has" + nodeList.getLength() + " elements");
-
- for (int i = 0; i < nodeList.getLength(); i++) {
- Element scElem = (Element) nodeList.item(i);
- Subcollection subCol = new Subcollection(getConf());
- subCol.initialize(scElem);
- collectionMap.put(subCol.name, subCol);
- }
- } else {
- LOG.info("Cannot find collections");
- }
+ parse(input);
} catch (Exception e) {
LOG.info("Error occured:" + e);
e.printStackTrace(System.out);
+ }
+ }
+
+ protected void parse(InputStream input) {
+ Element collections = DomUtil.getDom(input);
+
+ if (collections != null) {
+ NodeList nodeList = collections
+ .getElementsByTagName(Subcollection.TAG_COLLECTION);
+
+ LOG.info("file has" + nodeList.getLength() + " elements");
+
+ for (int i = 0; i < nodeList.getLength(); i++) {
+ Element scElem = (Element) nodeList.item(i);
+ Subcollection subCol = new Subcollection(getConf());
+ subCol.initialize(scElem);
+ collectionMap.put(subCol.name, subCol);
+ }
+ } else {
+ LOG.info("Cannot find collections");
}
}
Modified: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=413356&r1=413355&r2=413356&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Sat Jun 10 12:30:34 2006
@@ -158,11 +158,13 @@
}
/**
- * Initialize SubCollection from dom element
+ * Initialize Subcollection from dom element
*
* @param collection
*/
public void initialize(Element collection) {
+ this.id = DOMUtil.getChildText(
+ collection.getElementsByTagName(TAG_ID).item(0)).trim();
this.name = DOMUtil.getChildText(
collection.getElementsByTagName(TAG_NAME).item(0)).trim();
this.wlString = DOMUtil.getChildText(
Added: lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=413356&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Sat Jun 10 12:30:34 2006
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Collection;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestSubcollection extends TestCase {
+
+ /**Test filtering logic
+ *
+ * @throws Exception
+ */
+ public void testFilter() throws Exception {
+ Subcollection sc=new Subcollection(NutchConfiguration.create());
+ sc.setWhiteList("www.nutch.org\nwww.apache.org");
+ sc.setBlackList("jpg\nwww.apache.org/zecret/");
+
+ //matches whitelist
+ assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html"));
+
+ //matches blacklist
+ assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html"));
+ assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg"));
+
+ //no match
+ assertEquals(null, sc.filter("http://www.google.com/"));
+ }
+
+ public void testInput(){
+ StringBuffer xml=new StringBuffer();
+ xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
+ xml.append("<subcollections>");
+ xml.append("<subcollection>");
+ xml.append("<name>nutch collection</name>");
+ xml.append("<id>nutch</id>");
+ xml.append("<whitelist>");
+ xml.append("http://lucene.apache.org/nutch/\n");
+ xml.append("http://wiki.apache.org/nutch/\n");
+ xml.append("</whitelist>");
+ xml.append("<blacklist>");
+ xml.append("http://www.xxx.yyy\n");
+ xml.append("</blacklist>");
+ xml.append("</subcollection>");
+ xml.append("</subcollections>");
+
+ InputStream is=new ByteArrayInputStream(xml.toString().getBytes());
+
+ CollectionManager cm=new CollectionManager();
+ cm.parse(is);
+
+ Collection c=cm.getAll();
+
+ // test that size matches
+ assertEquals(1,c.size());
+
+ Subcollection collection=(Subcollection)c.toArray()[0];
+
+ //test collection id
+ assertEquals("nutch", collection.getId());
+
+ //test collection name
+ assertEquals("nutch collection", collection.getName());
+
+ //test whitelist
+ assertEquals(2,collection.whiteList.size());
+
+ String wlUrl=(String)collection.whiteList.get(0);
+ assertEquals("http://lucene.apache.org/nutch/", wlUrl);
+
+ wlUrl=(String)collection.whiteList.get(1);
+ assertEquals("http://wiki.apache.org/nutch/", wlUrl);
+
+ //matches whitelist
+ assertEquals("http://lucene.apache.org/nutch/", collection.filter("http://lucene.apache.org/nutch/"));
+
+ //test blacklist
+ assertEquals(1,collection.blackList.size());
+
+ String blUrl=(String)collection.blackList.get(0);
+ assertEquals("http://www.xxx.yyy", blUrl);
+
+ //no match
+ assertEquals(null, collection.filter("http://www.google.com/"));
+ }
+}