You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/06/30 06:44:57 UTC
nutch git commit: NUTCH-1553 Property 'indexer.delete.robots.noindex'
not working when using parse-html - add general metadata to parse metadata
where it can be checked by the indexer
Repository: nutch
Updated Branches:
refs/heads/master d96c936b6 -> cb6fbae51
NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parse-html
- add general metadata to parse metadata where it can be checked by the indexer
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/cb6fbae5
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/cb6fbae5
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/cb6fbae5
Branch: refs/heads/master
Commit: cb6fbae51a56587c30d15b8f170ebbf470851168
Parents: d96c936
Author: Sebastian Nagel <sn...@apache.org>
Authored: Thu Jun 30 08:12:02 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Thu Jun 30 08:40:33 2016 +0200
----------------------------------------------------------------------
.../src/java/org/apache/nutch/parse/html/HtmlParser.java | 6 ++++++
1 file changed, 6 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/cb6fbae5/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index ecf2f12..b6666aa 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -181,6 +181,12 @@ public class HtmlParser implements Parser {
// get meta directives
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+
+ // populate Nutch metadata with HTML meta directives
+ for (String name : metaTags.getGeneralTags().names()) {
+ metadata.add(name, metaTags.getGeneralTags().get(name));
+ }
+
if (LOG.isTraceEnabled()) {
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
}