You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/02/09 10:55:09 UTC

svn commit: r1242255 - in /nutch/trunk: ./ src/plugin/subcollection/src/java/org/apache/nutch/collection/ src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/

Author: markus
Date: Thu Feb  9 09:55:08 2012
New Revision: 1242255

URL: http://svn.apache.org/viewvc?rev=1242255&view=rev
Log:
NUTCH-1266 Subcollection to optionally write to configured fields

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
    nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
    nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1242255&r1=1242254&r2=1242255&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb  9 09:55:08 2012
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-1266 Subcollection to optionally write to configured fields (markus)
+
 * NUTCH-1005 Parse headings plugin (markus)
 
 * NUTCH-1264 Configurable indexing plugin index-metadata (jnioche)

Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1242255&r1=1242254&r2=1242255&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Thu Feb  9 09:55:08 2012
@@ -165,25 +165,25 @@ public class CollectionManager extends C
 
   /**
    * Return names of collections url is part of
-   * 
+   *
    * @param url
    *          The url to test against Collections
-   * @return Space delimited string of collection names url is part of
+   * @return List of Subcollections the url is part of
    */
-  public List<String> getSubCollections(final String url) {
-    List<String> collections = new ArrayList<String>();
+  public List<Subcollection> getSubCollections(final String url) {
+    List<Subcollection> collections = new ArrayList<Subcollection>();
     final Iterator iterator = collectionMap.values().iterator();
 
     while (iterator.hasNext()) {
       final Subcollection subCol = (Subcollection) iterator.next();
       if (subCol.filter(url) != null) {
-        collections.add(subCol.name);
+        collections.add(subCol);
       }
     }
-    if (LOG.isTraceEnabled()) { 
-      LOG.trace("subcollections:" + Arrays.toString(collections.toArray())); 
+    if (LOG.isTraceEnabled()) {
+      LOG.trace("subcollections:" + Arrays.toString(collections.toArray()));
     }
-    
+
     return collections;
   }
 

Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=1242255&r1=1242254&r2=1242255&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Thu Feb  9 09:55:08 2012
@@ -38,6 +38,7 @@ public class Subcollection extends Confi
   public static final String TAG_WHITELIST="whitelist";
   public static final String TAG_BLACKLIST="blacklist";
   public static final String TAG_NAME="name";
+  public static final String TAG_KEY="key";
   public static final String TAG_ID="id";
 
   ArrayList blackList = new ArrayList();
@@ -49,6 +50,11 @@ public class Subcollection extends Confi
    */
   String id;
 
+  /**
+   * SubCollection key
+   */
+  String key;
+
   /** 
    * SubCollection name
    */
@@ -70,8 +76,18 @@ public class Subcollection extends Confi
    * @param name name of SubCollection
    */
   public Subcollection(String id, String name, Configuration conf) {
+    this(id, name, null, conf);
+  }
+
+  /** public Constructor
+   *
+   * @param id id of SubCollection
+   * @param name name of SubCollection
+   */
+  public Subcollection(String id, String name, String key, Configuration conf) {
     this(conf);
     this.id=id;
+    this.key = key;
     this.name = name;
   }
 
@@ -87,6 +103,13 @@ public class Subcollection extends Confi
   }
 
   /**
+   * @return Returns the key
+   */
+  public String getKey() {
+    return key;
+  }
+
+  /**
    * @return Returns the id
    */
   public String getId() {
@@ -180,6 +203,12 @@ public class Subcollection extends Confi
       this.blString = DOMUtil.getChildText(nodeList.item(0)).trim();
       parseList(this.blackList, blString);
     }
+
+    // Check if there's a single key element; otherwise key keeps its current value
+    nodeList = collection.getElementsByTagName(TAG_KEY);
+    if (nodeList.getLength() == 1) {
+      this.key = DOMUtil.getChildText(nodeList.item(0)).trim();
+    }
   }
 
   /**

Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=1242255&r1=1242254&r2=1242255&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Thu Feb  9 09:55:08 2012
@@ -31,6 +31,7 @@ import org.apache.nutch.indexer.Indexing
 import org.apache.nutch.indexer.NutchDocument;
 
 import org.apache.nutch.collection.CollectionManager;
+import org.apache.nutch.collection.Subcollection;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
 
@@ -62,8 +63,12 @@ public class SubcollectionIndexingFilter
    * @param url
    */
   private void addSubCollectionField(NutchDocument doc, String url) {
-    for (String collname: CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
-      doc.add(FIELD_NAME, collname);
+    for (Subcollection coll : CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
+      if (coll.getKey() == null) {
+        doc.add(FIELD_NAME, coll.getName());
+      } else {
+        doc.add(coll.getKey(), coll.getName());
+      }
     }
   }