You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/11/07 08:23:56 UTC

[nutch] branch master updated: NUTCH-1559 parse-metatags duplicates extracted metatags - do not add metatags already in ParseData's as this may lead to duplicates - add unit test - fix logging in MetaTagsParser to use slf4j

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 497936b  NUTCH-1559 parse-metatags duplicates extracted metatags - do not add metatags already in ParseData's as this may lead to duplicates - add unit test - fix logging in MetaTagsParser to use slf4j
     new b554145  Merge pull request #481 from sebastian-nagel/NUTCH-1559-dupl-metatags
497936b is described below

commit 497936b37751f6716da372611bf34d777b88d14d
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Wed Oct 16 16:25:36 2019 +0200

    NUTCH-1559 parse-metatags duplicates extracted metatags
    - do not add metatags already in ParseData's metadata as this may lead to duplicates
    - add unit test
    - fix logging in MetaTagsParser to use slf4j
---
 .../nutch/parse/metatags/MetaTagsParser.java       | 22 ++++++++---------
 .../nutch/parse/metatags/TestMetatagParser.java    | 28 ++++++++++++++++++++++
 2 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 51bf25c..8deaf18 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -23,8 +23,6 @@ import java.util.Locale;
 import java.util.Properties;
 import java.util.Set;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.HTMLMetaTags;
@@ -32,6 +30,8 @@ import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
 
 /**
@@ -42,7 +42,7 @@ import org.w3c.dom.DocumentFragment;
 public class MetaTagsParser implements HtmlParseFilter {
 
   private static final Logger LOG = LoggerFactory
-		      .getLogger(MethodHandles.lookup().lookupClass());
+      .getLogger(MethodHandles.lookup().lookupClass());
 
   private Configuration conf;
 
@@ -71,7 +71,7 @@ public class MetaTagsParser implements HtmlParseFilter {
     String lcMetatag = metatag.toLowerCase(Locale.ROOT);
     if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+        LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
       }
       metadata.add("metatag." + lcMetatag, value);
     }
@@ -85,11 +85,12 @@ public class MetaTagsParser implements HtmlParseFilter {
       String[] values) {
     String lcMetatag = metatag.toLowerCase(Locale.ROOT);
     if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+      String key = "metatag." + lcMetatag;
       for (String value : values) {
         if (LOG.isDebugEnabled()) {
-          LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+          LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
         }
-        metadata.add("metatag." + lcMetatag, value);
+        metadata.add(key, value);
       }
     }
   }
@@ -100,11 +101,10 @@ public class MetaTagsParser implements HtmlParseFilter {
     Parse parse = parseResult.get(content.getUrl());
     Metadata metadata = parse.getData().getParseMeta();
 
-    // check in the metadata first : the tika-parser
-    // might have stored the values there already
-    for (String mdName : metadata.names()) {
-      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
-    }
+    /*
+     * NUTCH-1559: do not extract meta values from ParseData's metadata to avoid
+     * duplicate metatag values
+     */
 
     Metadata generalMetaTags = metaTags.getGeneralTags();
     for (String tagName : generalMetaTags.names()) {
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
index 3f03af9..5702c10 100644
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -16,6 +16,8 @@
  */
 package org.apache.nutch.parse.metatags;
 
+import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
 import java.util.Set;
 import java.util.TreeSet;
 
@@ -31,6 +33,8 @@ import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class TestMetatagParser {
 
@@ -41,6 +45,9 @@ public class TestMetatagParser {
   private String description = "This is a test of description";
   private String keywords = "This is a test of keywords";
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
   public Metadata parseMeta(String fileName, Configuration conf) {
     Metadata metadata = null;
     try {
@@ -100,4 +107,25 @@ public class TestMetatagParser {
     }
   }
 
+  @Test
+  public void testDuplicatedMetatags() {
+    String[] parsePlugins = { "parse-html", "parse-tika" };
+
+    for (String parsePlugin : parsePlugins) {
+
+      Configuration conf = NutchConfiguration.create();
+      conf.set("plugin.includes",
+          "protocol-file|parse-metatags|" + parsePlugin);
+      conf.set("metatags.names", "keywords");
+      conf.set("index.parse.md", "metatag.keywords");
+
+      Metadata parseMeta = parseMeta(sampleFile, conf);
+
+      LOG.info("metatags ({}): {}", parsePlugin,
+          Arrays.toString(parseMeta.getValues("metatag.keywords")));
+      Assert.assertEquals(
+          "Test document contains a single value of <meta name=keywords>, metatag.keywords should be also single-valued",
+          1, parseMeta.getValues("metatag.keywords").length);
+    }
+  }
 }