You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/06/05 22:12:49 UTC
svn commit: r411904 - in /lucene/nutch/trunk: conf/ src/plugin/
src/plugin/subcollection/ src/plugin/subcollection/src/
src/plugin/subcollection/src/java/ src/plugin/subcollection/src/java/org/
src/plugin/subcollection/src/java/org/apache/ src/plugin/s...
Author: siren
Date: Mon Jun 5 13:12:48 2006
New Revision: 411904
URL: http://svn.apache.org/viewvc?rev=411904&view=rev
Log:
NUTCH-201 add support for subcollections
Added:
lucene/nutch/trunk/conf/subcollections.xml.template
lucene/nutch/trunk/src/plugin/subcollection/
lucene/nutch/trunk/src/plugin/subcollection/README.txt
lucene/nutch/trunk/src/plugin/subcollection/build.xml
lucene/nutch/trunk/src/plugin/subcollection/plugin.xml
lucene/nutch/trunk/src/plugin/subcollection/src/
lucene/nutch/trunk/src/plugin/subcollection/src/java/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/
lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java
lucene/nutch/trunk/src/plugin/subcollection/src/test/
Modified:
lucene/nutch/trunk/src/plugin/build.xml
Added: lucene/nutch/trunk/conf/subcollections.xml.template
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/subcollections.xml.template?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/subcollections.xml.template (added)
+++ lucene/nutch/trunk/conf/subcollections.xml.template Mon Jun 5 13:12:48 2006
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<subcollections>
+ <subcollection>
+ <name>nutch</name>
+ <id>nutch</id>
+ <whitelist>
+http://lucene.apache.org/nutch/
+http://wiki.apache.org/nutch/
+ </whitelist>
+ <blacklist />
+ </subcollection>
+</subcollections>
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=411904&r1=411903&r2=411904&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon Jun 5 13:12:48 2006
@@ -52,6 +52,7 @@
<ant dir="query-url" target="deploy"/>
<ant dir="scoring-opic" target="deploy"/>
<ant dir="summary-basic" target="deploy"/>
+ <ant dir="subcollection" target="deploy"/>
<ant dir="summary-lucene" target="deploy"/>
<ant dir="urlfilter-automaton" target="deploy"/>
<ant dir="urlfilter-prefix" target="deploy"/>
@@ -133,6 +134,7 @@
<ant dir="query-site" target="clean"/>
<ant dir="query-url" target="clean"/>
<ant dir="scoring-opic" target="clean"/>
+ <ant dir="subcollection" target="clean"/>
<ant dir="summary-basic" target="clean"/>
<ant dir="summary-lucene" target="clean"/>
<ant dir="urlfilter-automaton" target="clean"/>
@@ -140,5 +142,4 @@
<ant dir="urlfilter-regex" target="clean"/>
<ant dir="urlfilter-suffix" target="clean"/>
</target>
-
</project>
Added: lucene/nutch/trunk/src/plugin/subcollection/README.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/README.txt?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/README.txt (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/README.txt Mon Jun 5 13:12:48 2006
@@ -0,0 +1,10 @@
+For a brief description of this plugin, see
+src/java/org/apache/nutch/collection/package.html
+
+Basically:
+You need to enable this plugin both during indexing and during searching.
+
+After indexing you can limit your searches to a certain
+subcollection with the keyword subcollection, e.g.
+
+"subcollection:nutch hadoop"
Added: lucene/nutch/trunk/src/plugin/subcollection/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/build.xml?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/build.xml Mon Jun 5 13:12:48 2006
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="subcollection" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
Added: lucene/nutch/trunk/src/plugin/subcollection/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/plugin.xml Mon Jun 5 13:12:48 2006
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="subcollection"
+ name="Subcollection indexing and query filter"
+ version="1.0.0"
+ provider-name="apache.org">
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <runtime>
+ <library name="subcollection.jar"/>
+ </runtime>
+
+ <extension id="org.apache.nutch.searcher.subcollection.query"
+ name="Subcollection Query Filter"
+ point="org.apache.nutch.searcher.QueryFilter">
+ <implementation id="SubcollectionQueryFilter"
+ class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter"
+ raw-fields="subcollection"/>
+ </extension>
+
+ <extension id="org.apache.nutch.indexer.subcollection.indexing"
+ name="Subcollection Indexing Filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="SubcollectionIndexingFilter"
+ class="org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter"/>
+
+ </extension>
+</plugin>
Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Mon Jun 5 13:12:48 2006
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.util.DomUtil;
+import org.apache.xerces.dom.DocumentImpl;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * Holds the subcollections read from the resource named by the
+ * <code>subcollections.config</code> property (default
+ * {@link #DEFAULT_FILE_NAME}) and answers which of them a url belongs to.
+ */
+public class CollectionManager extends Configured {
+
+  public static final String DEFAULT_FILE_NAME = "subcollections.xml";
+
+  static final Logger LOG = org.apache.hadoop.util.LogFormatter.getLogger(CollectionManager.class
+      .getName());
+
+  /**
+   * Known subcollections. NOTE(review): init() keys entries by name while
+   * createSubCollection() keys them by id; the two only agree when id and
+   * name are equal. Left as is because changing the key could break
+   * existing lookups.
+   */
+  transient Map collectionMap = new HashMap();
+
+  /** Location of the configuration resource; save() writes back to it. */
+  transient URL configfile;
+
+  /**
+   * Creates a manager and immediately loads the configured subcollections.
+   *
+   * @param conf configuration used to locate the subcollections resource
+   */
+  public CollectionManager(Configuration conf) {
+    super(conf);
+    init();
+  }
+
+  /**
+   * Reads the subcollection definitions and fills the collection map.
+   * Failures are logged and leave the manager with whatever was loaded
+   * before the failure; they are never propagated to the caller.
+   */
+  protected void init() {
+    InputStream input = null;
+    try {
+      LOG.info("initializing CollectionManager");
+      // initialize known subcollections
+      final String resource = getConf().get("subcollections.config",
+          DEFAULT_FILE_NAME);
+      configfile = getConf().getResource(resource);
+      input = getConf().getConfResourceAsInputStream(resource);
+
+      if (input == null) {
+        // no stream means there is nothing to parse
+        LOG.info("Cannot find collections");
+        return;
+      }
+
+      final Element collections = DomUtil.getDom(input);
+      if (collections == null) {
+        LOG.info("Cannot find collections");
+        return;
+      }
+
+      final NodeList nodeList = collections
+          .getElementsByTagName(Subcollection.TAG_COLLECTION);
+      final int count = nodeList.getLength();
+      LOG.info("file has " + count + " elements");
+
+      for (int i = 0; i < count; i++) {
+        final Element scElem = (Element) nodeList.item(i);
+        final Subcollection subCol = new Subcollection(getConf());
+        subCol.initialize(scElem);
+        collectionMap.put(subCol.name, subCol);
+      }
+    } catch (Exception e) {
+      // log with the stack trace instead of printing it to stdout
+      LOG.log(java.util.logging.Level.WARNING,
+          "Error occurred while loading subcollections", e);
+    } finally {
+      if (input != null) {
+        try {
+          input.close();
+        } catch (IOException ignored) {
+          // the definitions are already parsed; a failed close changes nothing
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns the manager cached in the given configuration under the key
+   * "collectionmanager", creating and caching it on first use.
+   *
+   * @param conf configuration that owns the cached instance
+   * @return the manager bound to <code>conf</code>
+   */
+  public static CollectionManager getCollectionManager(Configuration conf) {
+    String key = "collectionmanager";
+    CollectionManager impl = (CollectionManager) conf.getObject(key);
+    if (impl == null) {
+      try {
+        LOG.info("Instantiating CollectionManager");
+        impl = new CollectionManager(conf);
+        conf.setObject(key, impl);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create CollectionManager", e);
+      }
+    }
+    return impl;
+  }
+
+  /**
+   * Returns named subcollection
+   *
+   * @param id
+   *          key the subcollection is stored under
+   * @return Named SubCollection (or null if not existing)
+   */
+  public Subcollection getSubColection(final String id) {
+    return (Subcollection) collectionMap.get(id);
+  }
+
+  /**
+   * Delete named subcollection; unknown ids are ignored.
+   *
+   * @param id
+   *          Id of SubCollection to delete
+   */
+  public void deleteSubCollection(final String id) throws IOException {
+    collectionMap.remove(id);
+  }
+
+  /**
+   * Create a new subcollection.
+   *
+   * @param id
+   *          Id of SubCollection to create
+   * @param name
+   *          Name of SubCollection to create
+   * @return Created SubCollection or null if already existed
+   */
+  public Subcollection createSubCollection(final String id, final String name) {
+    if (collectionMap.containsKey(id)) {
+      return null;
+    }
+    final Subcollection subCol = new Subcollection(id, name, getConf());
+    collectionMap.put(id, subCol);
+    return subCol;
+  }
+
+  /**
+   * Return names of collections url is part of
+   *
+   * @param url
+   *          The url to test against Collections
+   * @return Space delimited string of collection names url is part of; each
+   *         name is preceded by a space, and the string is empty when no
+   *         collection matches
+   */
+  public String getSubCollections(final String url) {
+    final StringBuffer collections = new StringBuffer();
+    final Iterator iterator = collectionMap.values().iterator();
+
+    while (iterator.hasNext()) {
+      final Subcollection subCol = (Subcollection) iterator.next();
+      if (subCol.filter(url) != null) {
+        collections.append(' ').append(subCol.name);
+      }
+    }
+    LOG.fine("subcollections:" + collections);
+
+    return collections.toString();
+  }
+
+  /**
+   * Returns all collections
+   *
+   * @return All collections CollectionManager knows about
+   */
+  public Collection getAll() {
+    return collectionMap.values();
+  }
+
+  /**
+   * Save collections into the file they were loaded from.
+   *
+   * @throws IOException
+   *           if no configuration file is known or it cannot be written
+   */
+  public void save() throws IOException {
+    if (configfile == null) {
+      throw new IOException("No subcollection configuration file to save to");
+    }
+
+    // build the whole tree first so a failure here cannot truncate the file
+    final Document doc = new DocumentImpl();
+    final Element collections = doc
+        .createElement(Subcollection.TAG_COLLECTIONS);
+    final Iterator iterator = collectionMap.values().iterator();
+
+    while (iterator.hasNext()) {
+      final Subcollection subCol = (Subcollection) iterator.next();
+      final Element collection = doc
+          .createElement(Subcollection.TAG_COLLECTION);
+      collections.appendChild(collection);
+      appendTextElement(doc, collection, Subcollection.TAG_NAME, subCol
+          .getName());
+      if (subCol.getId() != null) {
+        appendTextElement(doc, collection, Subcollection.TAG_ID, subCol
+            .getId());
+      }
+      appendTextElement(doc, collection, Subcollection.TAG_WHITELIST, subCol
+          .getWhiteListString());
+      appendTextElement(doc, collection, Subcollection.TAG_BLACKLIST, subCol
+          .getBlackListString());
+    }
+
+    final FileOutputStream fos = new FileOutputStream(new File(configfile
+        .getFile()));
+    try {
+      DomUtil.saveDom(fos, collections);
+      fos.flush();
+    } finally {
+      fos.close();
+    }
+  }
+
+  /**
+   * Appends <code>&lt;tag&gt;text&lt;/tag&gt;</code> to parent. The text is
+   * stored in a child text node because setting the node value of an element
+   * has no effect. A null text is written as an empty string.
+   */
+  private static void appendTextElement(final Document doc,
+      final Element parent, final String tag, final String text) {
+    final Element child = doc.createElement(tag);
+    child.appendChild(doc.createTextNode(text == null ? "" : text));
+    parent.appendChild(child);
+  }
+}
Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Mon Jun 5 13:12:48 2006
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLFilter;
+import org.apache.xerces.util.DOMUtil;
+import org.w3c.dom.Element;
+
+/**
+ * SubCollection represents a subset of index, you can define url patterns that
+ * will indicate that particular page (url) is part of SubCollection.
+ * Patterns are plain substrings, matched with <code>indexOf</code>.
+ */
+public class Subcollection extends Configured implements URLFilter {
+
+  public static final String TAG_COLLECTIONS = "subcollections";
+  public static final String TAG_COLLECTION = "subcollection";
+  public static final String TAG_WHITELIST = "whitelist";
+  public static final String TAG_BLACKLIST = "blacklist";
+  public static final String TAG_NAME = "name";
+  public static final String TAG_ID = "id";
+
+  /** Patterns that exclude a url from this subcollection */
+  ArrayList blackList = new ArrayList();
+
+  /** Patterns that include a url in this subcollection */
+  ArrayList whiteList = new ArrayList();
+
+  /**
+   * SubCollection identifier
+   */
+  String id;
+
+  /**
+   * SubCollection name
+   */
+  String name;
+
+  /**
+   * SubCollection whitelist as String
+   */
+  String wlString;
+
+  /**
+   * SubCollection blacklist as String
+   */
+  String blString;
+
+  /** public Constructor
+   *
+   * @param id id of SubCollection
+   * @param name name of SubCollection
+   * @param conf configuration handed to the superclass
+   */
+  public Subcollection(String id, String name, Configuration conf) {
+    this(conf);
+    this.id = id;
+    this.name = name;
+  }
+
+  /**
+   * Creates an empty subcollection; call {@link #initialize(Element)} to
+   * populate it.
+   *
+   * @param conf configuration handed to the superclass
+   */
+  public Subcollection(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * @return Returns the name
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @return Returns the id
+   */
+  public String getId() {
+    return id;
+  }
+
+  /**
+   * Returns whitelist
+   *
+   * @return Whitelist entries
+   */
+  public ArrayList getWhiteList() {
+    return whiteList;
+  }
+
+  /**
+   * Returns whitelist String
+   *
+   * @return Whitelist String
+   */
+  public String getWhiteListString() {
+    return wlString;
+  }
+
+  /**
+   * Returns blacklist String
+   *
+   * @return Blacklist String
+   */
+  public String getBlackListString() {
+    return blString;
+  }
+
+  /**
+   * Replaces the whitelist entries. The whitelist String is not touched.
+   *
+   * @param whiteList
+   *          The whiteList to set.
+   */
+  public void setWhiteList(ArrayList whiteList) {
+    this.whiteList = whiteList;
+  }
+
+  /**
+   * Simple "indexOf" currentFilter for matching patterns.
+   *
+   * <pre>
+   * rules for evaluation are as follows:
+   * 1. if pattern matches in blacklist then url is rejected
+   * 2. if pattern matches in whitelist then url is allowed
+   * 3. url is rejected
+   * </pre>
+   *
+   * @param urlString url to test; a null url is rejected
+   * @return urlString when it is part of this subcollection, otherwise null
+   * @see org.apache.nutch.net.URLFilter#filter(java.lang.String)
+   */
+  public String filter(String urlString) {
+    if (urlString == null) {
+      return null;
+    }
+    // the blacklist wins over the whitelist
+    if (containsAny(urlString, blackList)) {
+      return null;
+    }
+    return containsAny(urlString, whiteList) ? urlString : null;
+  }
+
+  /** Tells whether urlString contains at least one of the given patterns. */
+  private static boolean containsAny(String urlString, ArrayList patterns) {
+    if (patterns == null) {
+      return false;
+    }
+    final Iterator i = patterns.iterator();
+    while (i.hasNext()) {
+      final String row = (String) i.next();
+      if (urlString.indexOf(row) != -1) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Initialize SubCollection from dom element. The name is mandatory; a
+   * missing id falls back to the name and missing white/blacklists are
+   * treated as empty.
+   *
+   * @param collection
+   *          the <code>subcollection</code> element to read
+   * @throws IllegalArgumentException
+   *           if the element has no <code>name</code> child
+   */
+  public void initialize(Element collection) {
+    final String parsedName = childText(collection, TAG_NAME);
+    if (parsedName == null) {
+      throw new IllegalArgumentException("<" + TAG_COLLECTION
+          + "> element has no <" + TAG_NAME + "> child");
+    }
+    this.name = parsedName;
+
+    final String parsedId = childText(collection, TAG_ID);
+    this.id = parsedId != null ? parsedId : parsedName;
+
+    final String white = childText(collection, TAG_WHITELIST);
+    this.wlString = white != null ? white : "";
+    final String black = childText(collection, TAG_BLACKLIST);
+    this.blString = black != null ? black : "";
+
+    parseList(this.whiteList, wlString);
+    parseList(this.blackList, blString);
+  }
+
+  /**
+   * Returns the trimmed text of the first descendant element called tag, or
+   * null when there is no such element.
+   */
+  private static String childText(Element parent, String tag) {
+    final org.w3c.dom.Node node = parent.getElementsByTagName(tag).item(0);
+    if (node == null) {
+      return null;
+    }
+    final String text = DOMUtil.getChildText(node);
+    return text == null ? null : text.trim();
+  }
+
+  /**
+   * Create a list of patterns from chunk of text, patterns are separated with
+   * newline. Blank lines are skipped: an empty pattern is a substring of
+   * every url and would make the list match everything.
+   *
+   * @param list
+   *          list to refill; it is cleared first
+   * @param text
+   *          newline separated patterns; null is treated as no patterns
+   */
+  protected void parseList(ArrayList list, String text) {
+    list.clear();
+    if (text == null) {
+      return;
+    }
+
+    final StringTokenizer st = new StringTokenizer(text, "\n\r");
+    while (st.hasMoreTokens()) {
+      final String line = st.nextToken().trim();
+      if (line.length() > 0) {
+        list.add(line);
+      }
+    }
+  }
+
+  /**
+   * Set contents of blacklist from String
+   *
+   * @param list the blacklist contents
+   */
+  public void setBlackList(String list) {
+    this.blString = list;
+    parseList(blackList, list);
+  }
+
+  /**
+   * Set contents of whitelist from String
+   *
+   * @param list the whitelist contents
+   */
+  public void setWhiteList(String list) {
+    this.wlString = list;
+    parseList(whiteList, list);
+  }
+}
Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html Mon Jun 5 13:12:48 2006
@@ -0,0 +1,36 @@
+<html>
+<body>
+<p>
+Subcollection is a subset of an index. Subcollections are defined
+by urlpatterns in form of white/blacklist. So to get the page into
+subcollection it must match the whitelist and not the blacklist.
+</p>
+<p>
+Subcollection definitions are read from a file subcollections.xml
+and the format is as follows (imagine here that you are crawling all
+the virtualhosts from apache.org and you want to tag pages with
+url patterns "http://lucene.apache.org/nutch" and "http://wiki.apache.org/nutch/"
+to be part of subcollection "nutch"; this allows you to later search
+specifically from this subcollection)
+</p>
+<p/>
+<p/>
+<pre>
+&lt;?xml version="1.0" encoding="UTF-8"?&gt;
+&lt;subcollections&gt;
+ &lt;subcollection&gt;
+ &lt;name&gt;nutch&lt;/name&gt;
+ &lt;id&gt;lucene&lt;/id&gt;
+ &lt;whitelist&gt;
+http://lucene.apache.org/nutch
+http://wiki.apache.org/nutch/
+ &lt;/whitelist&gt;
+ &lt;blacklist /&gt;
+ &lt;/subcollection&gt;
+&lt;/subcollections&gt;
+</pre>
+<p>Regardless of this configuration you can still crawl any urls
+as long as they pass through your global url filters. (Note that
+you must also seed your urls in the normal nutch way.)
+</p>
+</body>
+</html>
Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Mon Jun 5 13:12:48 2006
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.subcollection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+
+import org.apache.nutch.collection.CollectionManager;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import java.util.logging.Logger;
+
+/**
+ * Indexing filter that stores, for every document, the names of the
+ * subcollections its url belongs to.
+ */
+public class SubcollectionIndexingFilter extends Configured implements IndexingFilter {
+
+  /**
+   * Name of the document field that receives the subcollection names
+   */
+  public static final String FIELD_NAME = "subcollection";
+
+  /**
+   * Logger
+   */
+  public static final Logger LOG = LogFormatter
+      .getLogger(SubcollectionIndexingFilter.class.getName());
+
+  /** Creates a filter backed by a freshly created Nutch configuration. */
+  public SubcollectionIndexingFilter() {
+    super(NutchConfiguration.create());
+  }
+
+  /**
+   * Creates a filter backed by the given configuration.
+   *
+   * @param conf configuration used to look up the CollectionManager
+   */
+  public SubcollectionIndexingFilter(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * Adds the subcollection field for the given url and returns the same
+   * document instance.
+   */
+  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    tagWithSubcollections(doc, url.toString());
+    return doc;
+  }
+
+  /**
+   * Asks the CollectionManager which subcollections the address belongs to
+   * and adds the answer to the document as a stored, tokenized field.
+   *
+   * @param target document to tag
+   * @param address url of the document
+   */
+  private void tagWithSubcollections(Document target, String address) {
+    final CollectionManager manager = CollectionManager.getCollectionManager(getConf());
+    final String names = manager.getSubCollections(address);
+    final Field marker = new Field(FIELD_NAME, names, Field.Store.YES, Field.Index.TOKENIZED);
+    target.add(marker);
+  }
+}
Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java Mon Jun 5 13:12:48 2006
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.searcher.subcollection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter;
+import org.apache.nutch.searcher.RawFieldQueryFilter;
+
+/** Handles "subcollection:" query clauses, causing them to search the "subcollection" field
+ * indexed by SubcollectionIndexingFilter. */
+public class SubcollectionQueryFilter extends RawFieldQueryFilter {
+ /** Binds this filter to the field named by SubcollectionIndexingFilter.FIELD_NAME. */
+ public SubcollectionQueryFilter() {
+ super(SubcollectionIndexingFilter.FIELD_NAME);
+ }
+
+ /** Ignores the given configuration; this filter keeps no configuration. */
+ public void setConf(Configuration conf) {
+ // nothing to configure
+ }
+
+ /** Always returns null because no configuration is ever stored. */
+ public Configuration getConf() {
+ // nothing configured
+ return null;
+ }
+}
Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java Mon Jun 5 13:12:48 2006
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.xerces.parsers.DOMParser;
+import org.w3c.dom.Element;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Small helpers for reading and writing DOM trees. Both methods report
+ * failures by printing the stack trace instead of throwing.
+ */
+public class DomUtil {
+
+  /**
+   * Returns parsed dom tree or null if any error
+   *
+   * @param is
+   *          stream to parse as UTF-8 encoded xml; null yields null
+   * @return the document (root) element, or null if the stream is null or
+   *         cannot be parsed
+   */
+  public static Element getDom(InputStream is) {
+    if (is == null) {
+      return null;
+    }
+
+    Element element = null;
+    final DOMParser parser = new DOMParser();
+
+    try {
+      final InputSource input = new InputSource(is);
+      input.setEncoding("UTF-8");
+      parser.parse(input);
+      // the first child of a document may be a comment, a processing
+      // instruction or a doctype, so ask for the root element explicitly
+      element = parser.getDocument().getDocumentElement();
+    } catch (SAXException e) {
+      e.printStackTrace();
+    } catch (IOException e) {
+      // also covers FileNotFoundException
+      e.printStackTrace();
+    }
+    return element;
+  }
+
+  /**
+   * save dom into outputstream, indented; the stream is flushed but not
+   * closed
+   *
+   * @param os
+   *          stream to write to
+   * @param e
+   *          element to serialize
+   */
+  public static void saveDom(OutputStream os, Element e) {
+
+    final DOMSource source = new DOMSource(e);
+    final TransformerFactory transFactory = TransformerFactory.newInstance();
+    try {
+      final Transformer transformer = transFactory.newTransformer();
+      transformer.setOutputProperty("indent", "yes");
+      final StreamResult result = new StreamResult(os);
+      transformer.transform(source, result);
+      os.flush();
+    } catch (IOException e1) {
+      // also covers UnsupportedEncodingException
+      e1.printStackTrace();
+    } catch (TransformerException ex) {
+      // also covers TransformerConfigurationException
+      ex.printStackTrace();
+    }
+  }
+}