You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2016/06/30 06:44:57 UTC

nutch git commit: NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parse-html - copy the general HTML meta tags into the parse metadata, where the indexer can check them

Repository: nutch
Updated Branches:
  refs/heads/master d96c936b6 -> cb6fbae51


NUTCH-1553 Property 'indexer.delete.robots.noindex' not working when using parse-html
- copy the general HTML meta tags into the parse metadata, where the indexer can check them


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/cb6fbae5
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/cb6fbae5
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/cb6fbae5

Branch: refs/heads/master
Commit: cb6fbae51a56587c30d15b8f170ebbf470851168
Parents: d96c936
Author: Sebastian Nagel <sn...@apache.org>
Authored: Thu Jun 30 08:12:02 2016 +0200
Committer: Sebastian Nagel <sn...@apache.org>
Committed: Thu Jun 30 08:40:33 2016 +0200

----------------------------------------------------------------------
 .../src/java/org/apache/nutch/parse/html/HtmlParser.java       | 6 ++++++
 1 file changed, 6 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/cb6fbae5/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index ecf2f12..b6666aa 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -181,6 +181,12 @@ public class HtmlParser implements Parser {
 
     // get meta directives
     HTMLMetaProcessor.getMetaTags(metaTags, root, base);
+
+    // populate Nutch metadata with HTML meta directives
+    for (String name : metaTags.getGeneralTags().names()) {
+      metadata.add(name, metaTags.getGeneralTags().get(name));
+    }
+
     if (LOG.isTraceEnabled()) {
       LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
     }