You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/07/30 10:55:25 UTC

svn commit: r1614586 - in /nutch/trunk: ./ conf/ src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/ src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/ src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/

Author: jnioche
Date: Wed Jul 30 08:55:24 2014
New Revision: 1614586

URL: http://svn.apache.org/r1614586
Log:
NUTCH-1561 improve usability of parse-metatags and index-metadata

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
    nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
    nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 30 08:55:24 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1561 improve usability of parse-metatags and index-metadata (snagel)
+
 * NUTCH-1708 use same id when indexing and deleting redirects (snagel)
 
 * NUTCH-1818 Add deps-test-compile task for building plugins (jnioche)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Jul 30 08:55:24 2014
@@ -1344,12 +1344,12 @@
 <!-- parse-metatags plugin properties -->
 <property>
   <name>metatags.names</name>
-  <value>description;keywords</value>
-  <description> Names of the metatags to extract, separated by;. 
+  <value>description,keywords</value>
+  <description> Names of the metatags to extract, separated by ','.
   Use '*' to extract all metatags. Prefixes the names with 'metatag.'
   in the parse-metadata. For instance to index description and keywords, 
   you need to activate the plugin index-metadata and set the value of the 
-  parameter 'index.parse.md' to 'metatag.description;metatag.keywords'.
+  parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
   </description>
 </property>
 

Modified: nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java (original)
+++ nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java Wed Jul 30 08:55:24 2014
@@ -18,7 +18,8 @@
 package org.apache.nutch.indexer.metadata;
 
 import java.util.HashMap;
-import java.util.Map.Entry;
+import java.util.Locale;
+import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
@@ -30,73 +31,74 @@ import org.apache.nutch.indexer.NutchDoc
 import org.apache.nutch.parse.Parse;
 
 /**
- * Indexer which can be configured to extract metadata from the crawldb, parse metadata or content metadata.
- * You can specify the properties "index.db", "index.parse" or "index.content" who's values are
- * comma-delimited <value>key1, key2, key3</value>.
+ * Indexer which can be configured to extract metadata from the crawldb, parse
+ * metadata or content metadata. You can specify the properties "index.db.md",
+ * "index.parse.md" or "index.content.md" whose values are comma-delimited
+ * <value>key1,key2,key3</value>.
  */
-
 public class MetadataIndexer implements IndexingFilter {
-	private Configuration conf;
-	private HashMap<String, String[]> staticfields;
-	private static String[] dbFieldnames;
-	private static String[] parseFieldnames;
-	private static String[] contentFieldnames;
-	private static final String db_CONF_PROPERTY = "index.db.md";
-	private static final String parse_CONF_PROPERTY = "index.parse.md";
-	private static final String content_CONF_PROPERTY = "index.content.md";
-
-	public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-			CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-		// just in case
-		if (doc == null)
-			return doc;
-
-		// add the fields from crawldb
-		if (dbFieldnames != null) {
-			for (String metatag : dbFieldnames) {
-				Text metadata = (Text) datum.getMetaData().get(
-						new Text(metatag));
-				if (metadata != null)
-					doc.add(metatag, metadata.toString());
-			}
-		}
-
-		// add the fields from parsemd
-		if (parseFieldnames != null) {
-			for (String metatag : parseFieldnames) {
-				for (String value : parse.getData().getParseMeta().getValues(metatag)) {
-					if (value != null)
-						doc.add(metatag, value);
-				}
-			}
-		}
-
-		// add the fields from contentmd
-		if (contentFieldnames != null) {
-			for (String metatag : contentFieldnames) {
-				for (String value : parse.getData().getContentMeta().getValues(metatag)) {
-					if (value != null)
-						doc.add(metatag, value);
-				}
-			}
-		}
-
-		return doc;
-	}
-
-	public void setConf(Configuration conf) {
-		this.conf = conf;
-		dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
-		parseFieldnames = conf.getStrings(parse_CONF_PROPERTY);
-		contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
-
-		// TODO check conflict between field names e.g. could have same label
-		// from different sources
-
-	}
-
-	public Configuration getConf() {
-		return this.conf;
-	}
+  private Configuration conf;
+  private String[] dbFieldnames;
+  private Map<String, String> parseFieldnames;
+  private String[] contentFieldnames;
+  private static final String db_CONF_PROPERTY = "index.db.md";
+  private static final String parse_CONF_PROPERTY = "index.parse.md";
+  private static final String content_CONF_PROPERTY = "index.content.md";
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // just in case
+    if (doc == null)
+      return doc;
+
+    // add the fields from crawldb
+    if (dbFieldnames != null) {
+      for (String metatag : dbFieldnames) {
+        Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
+        if (metadata != null)
+          doc.add(metatag, metadata.toString());
+      }
+    }
+
+    // add the fields from parsemd
+    if (parseFieldnames != null) {
+      for (String metatag : parseFieldnames.keySet()) {
+        for (String value : parse.getData().getParseMeta().getValues(metatag)) {
+          if (value != null)
+            doc.add(parseFieldnames.get(metatag), value);
+        }
+      }
+    }
+
+    // add the fields from contentmd
+    if (contentFieldnames != null) {
+      for (String metatag : contentFieldnames) {
+        for (String value : parse.getData().getContentMeta().getValues(metatag)) {
+          if (value != null)
+            doc.add(metatag, value);
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
+    parseFieldnames = new HashMap<String, String>();
+    for (String metatag : conf.getStrings(parse_CONF_PROPERTY)) {
+      parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag);
+    }
+    contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
+
+    // TODO check conflict between field names e.g. could have same label
+    // from different sources
+
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
 }

Modified: nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java Wed Jul 30 08:55:24 2014
@@ -18,6 +18,7 @@ package org.apache.nutch.parse.metatags;
 
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.Locale;
 import java.util.Properties;
 import java.util.Set;
 
@@ -35,7 +36,7 @@ import org.w3c.dom.DocumentFragment;
 /**
  * Parse HTML meta tags (keywords, description) and store them in the parse
  * metadata so that they can be indexed with the index-metadata plugin with the
- * prefix 'metatag.'
+ * prefix 'metatag.'. Metatags are matched ignoring case.
  */
 public class MetaTagsParser implements HtmlParseFilter {
 
@@ -50,16 +51,48 @@ public class MetaTagsParser implements H
     this.conf = conf;
     // specify whether we want a specific subset of metadata
     // by default take everything we can find
-    String metatags = conf.get("metatags.names", "*");
-    String[] values = metatags.split(";");
-    for (String val : values)
-      metatagset.add(val.toLowerCase());
+    String[] values = conf.getStrings("metatags.names", "*");
+    for (String val : values) {
+      metatagset.add(val.toLowerCase(Locale.ROOT));
+    }
   }
 
   public Configuration getConf() {
     return this.conf;
   }
 
+  /**
+   * Check whether the metatag is in the list of metatags to be indexed (or if
+   * '*' is specified). If yes, add it to parse metadata.
+   */
+  private void addIndexedMetatags(Metadata metadata, String metatag,
+      String value) {
+    String lcMetatag = metatag.toLowerCase(Locale.ROOT);
+    if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+      }
+      metadata.add("metatag." + lcMetatag, value);
+    }
+  }
+
+  /**
+   * Check whether the metatag is in the list of metatags to be indexed (or if
+   * '*' is specified). If yes, add it with all values to parse metadata.
+   */
+  private void addIndexedMetatags(Metadata metadata, String metatag,
+      String[] values) {
+    String lcMetatag = metatag.toLowerCase(Locale.ROOT);
+    if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+      for (String value : values) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+        }
+        metadata.add("metatag." + lcMetatag, value);
+      }
+    }
+  }
+
   public ParseResult filter(Content content, ParseResult parseResult,
       HTMLMetaTags metaTags, DocumentFragment doc) {
 
@@ -68,42 +101,21 @@ public class MetaTagsParser implements H
 
     // check in the metadata first : the tika-parser
     // might have stored the values there already
-
     for (String mdName : metadata.names()) {
-      String value = metadata.get(mdName);
-      // check whether the name is in the list of what we want or if
-      // specified *
-      if (metatagset.contains("*") || metatagset.contains(mdName.toLowerCase())) {
-        LOG.debug("Found meta tag : " + mdName + "\t" + value);
-        metadata.add("metatag." + mdName.toLowerCase(), value);
-      }
+      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
     }
 
     Metadata generalMetaTags = metaTags.getGeneralTags();
-    for (String tagName : generalMetaTags.names() ) {
-    String[] tagValues = generalMetaTags.getValues(tagName);    
-  
-      for ( String tagValue : tagValues ) {
-      // check whether the name is in the list of what we want or if
-      // specified *
-    	 if (metatagset.contains("*") || metatagset.contains(tagName.toLowerCase())) {
-    		 LOG.debug("Found meta tag : " + tagName + "\t" + tagValue);
-    		 metadata.add("metatag." + tagName.toLowerCase(), tagValue);
-    	 }
-      }
+    for (String tagName : generalMetaTags.names()) {
+      addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName));
     }
 
     Properties httpequiv = metaTags.getHttpEquivTags();
-    for (Enumeration tagNames = httpequiv.propertyNames(); tagNames
+    for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
         .hasMoreElements();) {
       String name = (String) tagNames.nextElement();
       String value = httpequiv.getProperty(name);
-      // check whether the name is in the list of what we want or if
-      // specified *
-      if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) {
-        LOG.debug("Found meta tag : " + name + "\t" + value);
-        metadata.add("metatag." + name.toLowerCase(), value);
-      }
+      addIndexedMetatags(metadata, name, value);
     }
 
     return parseResult;

Modified: nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java Wed Jul 30 08:55:24 2014
@@ -34,18 +34,18 @@ import org.junit.Assert;
 import org.junit.Test;
 
 public class TestMetatagParser {
-  
+
   private String fileSeparator = System.getProperty("file.separator");
   private String sampleDir = System.getProperty("test.data", ".");
   private String sampleFile = "testMetatags.html";
   private String sampleFileMultival = "testMultivalueMetatags.html";
   private String description = "This is a test of description";
   private String keywords = "This is a test of keywords";
-  
+
   public Metadata parseMeta(String fileName, Configuration conf) {
     Metadata metadata = null;
     try {
-      String urlString = "file:" + sampleDir + fileSeparator + fileName;     
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
       Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
       Content content = protocol.getProtocolOutput(new Text(urlString),
           new CrawlDatum()).getContent();
@@ -59,43 +59,46 @@ public class TestMetatagParser {
   }
 
   @Test
+  /** test defaults: keywords and description */
   public void testIt() {
     Configuration conf = NutchConfiguration.create();
-    
+
     // check that we get the same values
-    Metadata parseMeta= parseMeta(sampleFile, conf);
-      
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+
     Assert.assertEquals(description, parseMeta.get("metatag.description"));
     Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
   }
 
   @Test
+  /** test multiple metatags resulting in metadata with multiple values */
   public void testMultiValueMetatags() {
     Configuration conf = NutchConfiguration.create();
-    conf.set("metatags.names", "keywords;DC.creator");
+    conf.set("metatags.names", "keywords,DC.creator");
     conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
 
     Metadata parseMeta = parseMeta(sampleFileMultival, conf);
-    
+
     String failMessage = "One value of metatag with multiple values is missing: ";
 
     Set<String> valueSet = new TreeSet<String>();
     for (String val : parseMeta.getValues("metatag.dc.creator")) {
       valueSet.add(val);
     }
-    String[] expectedValues1 = {"Doug Cutting", "Michael Cafarella"};
+    String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" };
     for (String val : expectedValues1) {
-      Assert.assertTrue(failMessage + val, valueSet.contains(val));      
+      Assert.assertTrue(failMessage + val, valueSet.contains(val));
     }
-    
+
     valueSet.clear();
     for (String val : parseMeta.getValues("metatag.keywords")) {
       valueSet.add(val);
     }
-    String[] expectedValues2 = {"robot d'indexation", "web crawler", "Webcrawler"};
+    String[] expectedValues2 = { "robot d'indexation", "web crawler",
+        "Webcrawler" };
     for (String val : expectedValues2) {
-      Assert.assertTrue(failMessage + val, valueSet.contains(val));      
+      Assert.assertTrue(failMessage + val, valueSet.contains(val));
     }
   }
-  
+
 }