You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2017/03/16 10:51:57 UTC

[nutch] branch master updated: NUTCH-2068 Allow subcollection overrides via metadata

This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

The following commit(s) were added to refs/heads/master by this push:
       new  9fb7d6c   NUTCH-2068 Allow subcollection overrides via metadata
9fb7d6c is described below

commit 9fb7d6c2e61ce36375722b16842b694621f3b053
Author: Markus Jelsma <ma...@apache.org>
AuthorDate: Thu Mar 16 11:51:39 2017 +0100

    NUTCH-2068 Allow subcollection overrides via metadata
---
 .../subcollection/SubcollectionIndexingFilter.java      | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
index c4b8b31..df12e4f 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -57,6 +57,7 @@ public class SubcollectionIndexingFilter extends Configured implements
   public void setConf(Configuration conf) {
     this.conf = conf;
     fieldName = conf.get("subcollection.default.fieldname", "subcollection");
+    metadataSource = conf.get("subcollection.metadata.source", "subcollection");
   }
 
   /**
@@ -70,6 +71,11 @@ public class SubcollectionIndexingFilter extends Configured implements
    * Doc field name
    */
   public static String fieldName = "subcollection";
+  
+  /**
+   * Metadata source field name
+   */
+  public static String metadataSource = "subcollection";
 
   /**
    * Logger
@@ -96,6 +102,17 @@ public class SubcollectionIndexingFilter extends Configured implements
 
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
       CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    // Check for subcollection override in HTML metadata
+    String subcollection = parse.getData().getMeta(metadataSource);
+    if (subcollection != null) {
+      subcollection = subcollection.trim();
+      
+      if (subcollection.length() > 0) {
+        doc.add(fieldName, subcollection);
+        return doc;
+      }
+    }
+    
     String sUrl = url.toString();
     addSubCollectionField(doc, sUrl);
     return doc;

-- 
To stop receiving notification emails like this one, please contact
['"commits@nutch.apache.org" <co...@nutch.apache.org>'].