You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2017/03/16 10:51:57 UTC
[nutch] branch master updated: NUTCH-2068 Allow subcollection overrides via metadata
This is an automated email from the ASF dual-hosted git repository.
markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 9fb7d6c NUTCH-2068 Allow subcollection overrides via metadata
9fb7d6c is described below
commit 9fb7d6c2e61ce36375722b16842b694621f3b053
Author: Markus Jelsma <ma...@apache.org>
AuthorDate: Thu Mar 16 11:51:39 2017 +0100
NUTCH-2068 Allow subcollection overrides via metadata
---
.../subcollection/SubcollectionIndexingFilter.java | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
index c4b8b31..df12e4f 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -57,6 +57,7 @@ public class SubcollectionIndexingFilter extends Configured implements
public void setConf(Configuration conf) {
this.conf = conf;
fieldName = conf.get("subcollection.default.fieldname", "subcollection");
+ metadataSource = conf.get("subcollection.metadata.source", "subcollection");
}
/**
@@ -70,6 +71,11 @@ public class SubcollectionIndexingFilter extends Configured implements
* Doc field name
*/
public static String fieldName = "subcollection";
+
+ /**
+ * Metadata source field name
+ */
+ public static String metadataSource = "subcollection";
/**
* Logger
@@ -96,6 +102,17 @@ public class SubcollectionIndexingFilter extends Configured implements
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+ // Check for subcollection override in HTML metadata
+ String subcollection = parse.getData().getMeta(metadataSource);
+ if (subcollection != null) {
+ subcollection = subcollection.trim();
+
+ if (subcollection.length() > 0) {
+ doc.add(fieldName, subcollection);
+ return doc;
+ }
+ }
+
String sUrl = url.toString();
addSubCollectionField(doc, sUrl);
return doc;
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.