You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/06/05 22:12:49 UTC

svn commit: r411904 - in /lucene/nutch/trunk: conf/ src/plugin/ src/plugin/subcollection/ src/plugin/subcollection/src/ src/plugin/subcollection/src/java/ src/plugin/subcollection/src/java/org/ src/plugin/subcollection/src/java/org/apache/ src/plugin/s...

Author: siren
Date: Mon Jun  5 13:12:48 2006
New Revision: 411904

URL: http://svn.apache.org/viewvc?rev=411904&view=rev
Log:
NUTCH-201 add support for subcollections

Added:
    lucene/nutch/trunk/conf/subcollections.xml.template
    lucene/nutch/trunk/src/plugin/subcollection/
    lucene/nutch/trunk/src/plugin/subcollection/README.txt
    lucene/nutch/trunk/src/plugin/subcollection/build.xml
    lucene/nutch/trunk/src/plugin/subcollection/plugin.xml
    lucene/nutch/trunk/src/plugin/subcollection/src/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/
    lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java
    lucene/nutch/trunk/src/plugin/subcollection/src/test/
Modified:
    lucene/nutch/trunk/src/plugin/build.xml

Added: lucene/nutch/trunk/conf/subcollections.xml.template
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/subcollections.xml.template?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/subcollections.xml.template (added)
+++ lucene/nutch/trunk/conf/subcollections.xml.template Mon Jun  5 13:12:48 2006
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<subcollections>
+	<subcollection>
+		<name>nutch</name>
+		<id>nutch</id>
+		<whitelist>
+http://lucene.apache.org/nutch/
+http://wiki.apache.org/nutch/
+                </whitelist>
+		<blacklist />
+	</subcollection>
+</subcollections>

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=411904&r1=411903&r2=411904&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Mon Jun  5 13:12:48 2006
@@ -52,6 +52,7 @@
      <ant dir="query-url" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
      <ant dir="summary-basic" target="deploy"/>
+     <ant dir="subcollection" target="deploy"/>
      <ant dir="summary-lucene" target="deploy"/>
      <ant dir="urlfilter-automaton" target="deploy"/>
      <ant dir="urlfilter-prefix" target="deploy"/>
@@ -133,6 +134,7 @@
     <ant dir="query-site" target="clean"/>
     <ant dir="query-url" target="clean"/>
     <ant dir="scoring-opic" target="clean"/>
+    <ant dir="subcollection" target="clean"/>
     <ant dir="summary-basic" target="clean"/>
     <ant dir="summary-lucene" target="clean"/>
     <ant dir="urlfilter-automaton" target="clean"/>
@@ -140,5 +142,4 @@
     <ant dir="urlfilter-regex" target="clean"/>
     <ant dir="urlfilter-suffix" target="clean"/>
   </target>
-
 </project>

Added: lucene/nutch/trunk/src/plugin/subcollection/README.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/README.txt?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/README.txt (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/README.txt Mon Jun  5 13:12:48 2006
@@ -0,0 +1,10 @@
+For a brief description of this plugin, see
+src/java/org/apache/nutch/collection/package.html
+
+Basically:
+You need to enable this during indexing and during searching
+
+After indexing you can limit your searches to a certain
+subcollection with the keyword subcollection, e.g.
+
+"subcollection:nutch hadoop"

Added: lucene/nutch/trunk/src/plugin/subcollection/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/build.xml?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/build.xml Mon Jun  5 13:12:48 2006
@@ -0,0 +1,7 @@
+<?xml version="1.0"?>
+
+<project name="subcollection" default="jar">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: lucene/nutch/trunk/src/plugin/subcollection/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/plugin.xml Mon Jun  5 13:12:48 2006
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="subcollection"
+   name="Subcollection indexing and query filter"
+   version="1.0.0"
+   provider-name="apache.org">
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <runtime>
+      <library name="subcollection.jar"/>
+   </runtime>
+
+   <extension id="org.apache.nutch.searcher.subcollection.query"
+              name="Subcollection Query Filter"
+              point="org.apache.nutch.searcher.QueryFilter">
+    <implementation id="SubcollectionQueryFilter"
+               class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter"
+               raw-fields="subcollection"/>
+   </extension>      
+
+   <extension id="org.apache.nutch.indexer.subcollection.indexing"
+              name="Subcollection Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="SubcollectionIndexingFilter"
+                      class="org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter"/>
+                      
+   </extension>
+</plugin>

Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Mon Jun  5 13:12:48 2006
@@ -0,0 +1,212 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.util.DomUtil;
+import org.apache.xerces.dom.DocumentImpl;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+public class CollectionManager extends Configured {
+
+  public static final String DEFAULT_FILE_NAME = "subcollections.xml";
+
+  static final Logger LOG = org.apache.hadoop.util.LogFormatter.getLogger(CollectionManager.class
+      .getName());
+
+  transient Map collectionMap = new HashMap();
+
+  transient URL configfile;
+  
+  public CollectionManager(Configuration conf) {
+    super(conf);
+    init();
+  }
+
+  protected void init(){
+    try {
+      LOG.info("initializing CollectionManager");
+      // initialize known subcollections
+      configfile = getConf().getResource(
+          getConf().get("subcollections.config", DEFAULT_FILE_NAME));
+
+      InputStream input = getConf().getConfResourceAsInputStream(
+          getConf().get("subcollections.config", DEFAULT_FILE_NAME));
+      Element collections = DomUtil.getDom(input);
+
+      if (collections != null) {
+        NodeList nodeList = collections
+            .getElementsByTagName(Subcollection.TAG_COLLECTION);
+
+        LOG.info("file has" + nodeList.getLength() + " elements");
+        
+        for (int i = 0; i < nodeList.getLength(); i++) {
+          Element scElem = (Element) nodeList.item(i);
+          Subcollection subCol = new Subcollection(getConf());
+          subCol.initialize(scElem);
+          collectionMap.put(subCol.name, subCol);
+        }
+      } else {
+        LOG.info("Cannot find collections");
+      }
+    } catch (Exception e) {
+      LOG.info("Error occured:" + e);
+      e.printStackTrace(System.out);
+    }
+  }
+  
+  public static CollectionManager getCollectionManager(Configuration conf) {
+    String key = "collectionmanager";
+    CollectionManager impl = (CollectionManager)conf.getObject(key);
+    if (impl == null) {
+      try {
+        LOG.info("Instantiating CollectionManager");
+        impl=new CollectionManager(conf);
+        conf.setObject(key,impl);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create CollectionManager",e);
+      }
+    }
+    return impl;
+  }
+
+  /**
+   * Returns named subcollection
+   * 
+   * @param id
+   * @return Named SubCollection (or null if not existing)
+   */
+  public Subcollection getSubColection(final String id) {
+    return (Subcollection) collectionMap.get(id);
+  }
+
+  /**
+   * Delete named subcollection
+   * 
+   * @param id
+   *          Id of SubCollection to delete
+   */
+  public void deleteSubCollection(final String id) throws IOException {
+    final Subcollection subCol = getSubColection(id);
+    if (subCol != null) {
+      collectionMap.remove(id);
+    }
+  }
+
+  /**
+   * Create a new subcollection.
+   * 
+   * @param id Id of SubCollection to create
+   * @param name Name of SubCollection to create
+   * @return Created SubCollection or null if one with this id already existed
+   */
+  public Subcollection createSubCollection(final String id, final String name) {
+    Subcollection subCol = null;
+
+    if (!collectionMap.containsKey(id)) {
+      subCol = new Subcollection(id, name, getConf());
+      collectionMap.put(id, subCol);
+    }
+
+    return subCol;
+  }
+
+  /**
+   * Return names of collections url is part of
+   * 
+   * @param url
+   *          The url to test against Collections
+   * @return Space delimited string of collection names url is part of
+   */
+  public String getSubCollections(final String url) {
+    String collections = "";
+    final Iterator iterator = collectionMap.values().iterator();
+
+    while (iterator.hasNext()) {
+      final Subcollection subCol = (Subcollection) iterator.next();
+      if (subCol.filter(url) != null) {
+        collections += " " + subCol.name;
+      }
+    }
+    LOG.fine("subcollections:" + collections);
+    
+    return collections;
+  }
+
+  /**
+   * Returns all collections
+   * 
+   * @return All collections CollectionManager knows about
+   */
+  public Collection getAll() {
+    return collectionMap.values();
+  }
+
+  /**
+   * Save collections into file
+   * 
+   * @throws IOException if the configuration file cannot be opened for writing
+   */
+  public void save() throws IOException {
+    try {
+      final FileOutputStream fos = new FileOutputStream(new File(configfile
+          .getFile()));
+      final Document doc = new DocumentImpl();
+      final Element collections = doc
+          .createElement(Subcollection.TAG_COLLECTIONS);
+      final Iterator iterator = collectionMap.values().iterator();
+
+      while (iterator.hasNext()) {
+        final Subcollection subCol = (Subcollection) iterator.next();
+        final Element collection = doc
+            .createElement(Subcollection.TAG_COLLECTION);
+        collections.appendChild(collection);
+        final Element name = doc.createElement(Subcollection.TAG_NAME);
+        name.setNodeValue(subCol.getName());
+        collection.appendChild(name);
+        final Element whiteList = doc
+            .createElement(Subcollection.TAG_WHITELIST);
+        whiteList.setNodeValue(subCol.getWhiteListString());
+        collection.appendChild(whiteList);
+        final Element blackList = doc
+            .createElement(Subcollection.TAG_BLACKLIST);
+        blackList.setNodeValue(subCol.getBlackListString());
+        collection.appendChild(blackList);
+      }
+
+      DomUtil.saveDom(fos, collections);
+      fos.flush();
+      fos.close();
+    } catch (FileNotFoundException e) {
+      throw new IOException(e.toString());
+    }
+  }
+}

Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Mon Jun  5 13:12:48 2006
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLFilter;
+import org.apache.xerces.util.DOMUtil;
+import org.w3c.dom.Element;
+
+/**
+ * SubCollection represents a subset of index, you can define url patterns that
+ * will indicate that particular page (url) is part of SubCollection.
+ */
+public class Subcollection extends Configured implements URLFilter{
+  
+  public static final String TAG_COLLECTIONS="subcollections";
+  public static final String TAG_COLLECTION="subcollection";
+  public static final String TAG_WHITELIST="whitelist";
+  public static final String TAG_BLACKLIST="blacklist";
+  public static final String TAG_NAME="name";
+  public static final String TAG_ID="id";
+
+  ArrayList blackList = new ArrayList();
+
+  ArrayList whiteList = new ArrayList();
+
+  /** 
+   * SubCollection identifier
+   */
+  String id;
+
+  /** 
+   * SubCollection name
+   */
+  String name;
+
+  /** 
+   * SubCollection whitelist as String
+   */
+  String wlString;
+
+  /**
+   * SubCollection blacklist as String
+   */
+  String blString;
+
+  /** Public constructor.
+   * @param id id of SubCollection
+   * @param name name of SubCollection
+   * @param conf configuration handed to the single-argument constructor
+   */
+  public Subcollection(String id, String name, Configuration conf) {
+    this(conf);
+    this.id=id;
+    this.name = name;
+  }
+
+  public Subcollection(Configuration conf){
+    super(conf);
+  }
+  
+  /**
+   * @return Returns the name
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @return Returns the id
+   */
+  public String getId() {
+    return id;
+  }
+
+  /**
+   * Returns whitelist
+   * 
+   * @return Whitelist entries
+   */
+  public ArrayList getWhiteList() {
+    return whiteList;
+  }
+
+  /**
+   * Returns whitelist String
+   * 
+   * @return Whitelist String
+   */
+  public String getWhiteListString() {
+    return wlString;
+  }
+
+  /**
+   * Returns blacklist String
+   * 
+   * @return Blacklist String
+   */
+  public String getBlackListString() {
+    return blString;
+  }
+
+  /**
+   * @param whiteList
+   *          The whiteList to set.
+   */
+  public void setWhiteList(ArrayList whiteList) {
+    this.whiteList = whiteList;
+  }
+
+  /**
+   * Simple "indexOf" filter for matching patterns.
+   * 
+   * <pre>
+   *  rules for evaluation are as follows:
+   *  1. if pattern matches in blacklist then url is rejected
+   *  2. if pattern matches in whitelist then url is allowed
+   *  3. url is rejected
+   * </pre>
+   * 
+   * @see org.apache.nutch.net.URLFilter#filter(java.lang.String)
+   */
+  public String filter(String urlString) {
+    // first the blacklist
+    Iterator i = blackList.iterator();
+    while (i.hasNext()) {
+      String row = (String) i.next();
+      if (urlString.indexOf(row) != -1)
+        return null;
+    }
+
+    // then whitelist
+    i = whiteList.iterator();
+    while (i.hasNext()) {
+      String row = (String) i.next();
+      if (urlString.indexOf(row) != -1)
+        return urlString;
+    }
+    return null;
+  }
+
+  /**
+   * Initialize SubCollection from dom element
+   * 
+   * @param collection
+   */
+  public void initialize(Element collection) {
+    this.name = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_NAME).item(0)).trim();
+    this.wlString = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim();
+    this.blString = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_BLACKLIST).item(0)).trim();
+
+    parseList(this.whiteList, wlString);
+    parseList(this.blackList, blString);
+  }
+
+  /**
+   * Create a list of patterns from chunk of text, patterns are separated with
+   * newline
+   * 
+   * @param list
+   * @param text
+   */
+  protected void parseList(ArrayList list, String text) {
+    list.clear();
+
+    StringTokenizer st = new StringTokenizer(text, "\n\r");
+
+    while (st.hasMoreElements()) {
+      String line = (String) st.nextElement();
+      list.add(line.trim());
+    }
+  }
+
+  /**
+   * Set contents of blacklist from String
+   * 
+   * @param list the blacklist contents
+   */
+  public void setBlackList(String list) {
+    this.blString = list;
+    parseList(blackList, list);
+  }
+
+  /**
+   * Set contents of whitelist from String
+   * 
+   * @param list the whitelist contents
+   */
+  public void setWhiteList(String list) {
+    this.wlString = list;
+    parseList(whiteList, list);
+  }
+}

Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html Mon Jun  5 13:12:48 2006
@@ -0,0 +1,36 @@
+<html>
+<body>
+<p>
+A subcollection is a subset of an index. Subcollections are defined
+by url patterns in the form of a whitelist and a blacklist. To get a page
+into a subcollection its url must match the whitelist and not the blacklist.
+</p>
+<p>
+Subcollection definitions are read from a file subcollections.xml
+and the format is as follows (imagine here that you are crawling all
+the virtualhosts from apache.org and you want to tag pages with
+url patterns "http://lucene.apache.org/nutch" and "http://wiki.apache.org/nutch/"
+to be part of subcollection "nutch"; this allows you to later search
+specifically from this subcollection)
+</p>
+<p/>
+<p/>
+<pre>
+&lt;?xml version="1.0" encoding="UTF-8"?>
+&lt;subcollections>
+	&lt;subcollection>
+		&lt;name>nutch&lt;/name>
+		&lt;id>lucene&lt;/id>
+		&lt;whitelist>http://lucene.apache.org/nutch
+http://wiki.apache.org/nutch/&lt;/whitelist>
+		&lt;blacklist />
+	&lt;/subcollection>
+&lt;/subcollections>
+</pre>
+</p>
+<p>Regardless of this configuration you can still crawl any urls
+as long as they pass through your global url filters. (Note that
+you must also seed your urls in the normal nutch way.)
+</p>
+</body>
+</html>

Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Mon Jun  5 13:12:48 2006
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.subcollection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+
+import org.apache.nutch.collection.CollectionManager;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import java.util.logging.Logger;
+
+public class SubcollectionIndexingFilter extends Configured implements IndexingFilter {
+
+  public SubcollectionIndexingFilter(){
+    super(NutchConfiguration.create());
+  }
+  
+  public SubcollectionIndexingFilter(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * Doc field name
+   */
+  public static final String FIELD_NAME = "subcollection";
+
+  /**
+   * Logger
+   */
+  public static final Logger LOG = LogFormatter
+      .getLogger(SubcollectionIndexingFilter.class.getName());
+
+  /**
+   * "Mark" document to be a part of subcollection
+   * 
+   * @param doc
+   * @param url
+   */
+  private void addSubCollectionField(Document doc, String url) {
+    String collname = CollectionManager.getCollectionManager(getConf()).getSubCollections(url);
+    doc.add(new Field(FIELD_NAME, collname, Field.Store.YES, Field.Index.TOKENIZED));
+  }
+
+  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    String sUrl = url.toString();
+    addSubCollectionField(doc, sUrl);
+    return doc;
+  }
+}

Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/searcher/subcollection/SubcollectionQueryFilter.java Mon Jun  5 13:12:48 2006
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.searcher.subcollection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.subcollection.SubcollectionIndexingFilter;
+import org.apache.nutch.searcher.RawFieldQueryFilter;
+
+/** Handles "subcollection:" query clauses, causing them to search the "subcollection" field
+ * indexed by SubcollectionIndexingFilter. */
+public class SubcollectionQueryFilter extends RawFieldQueryFilter {
+  public SubcollectionQueryFilter() {
+    super(SubcollectionIndexingFilter.FIELD_NAME);
+  }
+
+  public void setConf(Configuration conf) {
+    // nothing to configure
+  }
+
+  public Configuration getConf() {
+    // nothing configured
+    return null;
+  }
+}

Added: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java?rev=411904&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java (added)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/util/DomUtil.java Mon Jun  5 13:12:48 2006
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.apache.xerces.parsers.DOMParser;
+import org.w3c.dom.Element;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+public class DomUtil {
+
+  /**
+   * Returns parsed dom tree or null if any error
+   * 
+   * @param is
+   * @return
+   */
+  public static Element getDom(InputStream is) {
+
+    Element element = null;
+
+    DOMParser parser = new DOMParser();
+
+    InputSource input;
+    try {
+      input = new InputSource(is);
+      input.setEncoding("UTF-8");
+      parser.parse(input);
+      element = (Element) parser.getDocument().getChildNodes().item(0);
+    } catch (FileNotFoundException e) {
+      e.printStackTrace();
+    } catch (SAXException e) {
+      e.printStackTrace();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    return element;
+  }
+
+  /**
+   * Save dom into output stream
+   * 
+   * @param os
+   * @param e
+   */
+  public static void saveDom(OutputStream os, Element e) {
+
+    DOMSource source = new DOMSource(e);
+    TransformerFactory transFactory = TransformerFactory.newInstance();
+    Transformer transformer;
+    try {
+      transformer = transFactory.newTransformer();
+      transformer.setOutputProperty("indent", "yes");
+      StreamResult result = new StreamResult(os);
+      transformer.transform(source, result);
+      os.flush();
+    } catch (UnsupportedEncodingException e1) {
+      e1.printStackTrace();
+    } catch (IOException e1) {
+      e1.printStackTrace();
+    } catch (TransformerConfigurationException e2) {
+      e2.printStackTrace();
+    } catch (TransformerException ex) {
+      ex.printStackTrace();
+    }
+  }
+}