You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/07/30 10:55:25 UTC
svn commit: r1614586 - in /nutch/trunk: ./ conf/
src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/
src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/
src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/
Author: jnioche
Date: Wed Jul 30 08:55:24 2014
New Revision: 1614586
URL: http://svn.apache.org/r1614586
Log:
NUTCH-1561 improve usability of parse-metatags and index-metadata
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 30 08:55:24 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1561 improve usability of parse-metatags and index-metadata (snagel)
+
* NUTCH-1708 use same id when indexing and deleting redirects (snagel)
* NUTCH-1818 Add deps-test-compile task for building plugins (jnioche)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Jul 30 08:55:24 2014
@@ -1344,12 +1344,12 @@
<!-- parse-metatags plugin properties -->
<property>
<name>metatags.names</name>
- <value>description;keywords</value>
- <description> Names of the metatags to extract, separated by;.
+ <value>description,keywords</value>
+ <description> Names of the metatags to extract, separated by ','.
Use '*' to extract all metatags. Prefixes the names with 'metatag.'
in the parse-metadata. For instance to index description and keywords,
you need to activate the plugin index-metadata and set the value of the
- parameter 'index.parse.md' to 'metatag.description;metatag.keywords'.
+ parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
</description>
</property>
Modified: nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java (original)
+++ nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java Wed Jul 30 08:55:24 2014
@@ -18,7 +18,8 @@
package org.apache.nutch.indexer.metadata;
import java.util.HashMap;
-import java.util.Map.Entry;
+import java.util.Locale;
+import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -30,73 +31,74 @@ import org.apache.nutch.indexer.NutchDoc
import org.apache.nutch.parse.Parse;
/**
- * Indexer which can be configured to extract metadata from the crawldb, parse metadata or content metadata.
- * You can specify the properties "index.db", "index.parse" or "index.content" who's values are
- * comma-delimited <value>key1, key2, key3</value>.
+ * Indexer which can be configured to extract metadata from the crawldb, parse
+ * metadata or content metadata. You can specify the properties "index.db.md",
+ * "index.parse.md" or "index.content.md" whose values are comma-delimited
+ * <value>key1,key2,key3</value>.
*/
-
public class MetadataIndexer implements IndexingFilter {
- private Configuration conf;
- private HashMap<String, String[]> staticfields;
- private static String[] dbFieldnames;
- private static String[] parseFieldnames;
- private static String[] contentFieldnames;
- private static final String db_CONF_PROPERTY = "index.db.md";
- private static final String parse_CONF_PROPERTY = "index.parse.md";
- private static final String content_CONF_PROPERTY = "index.content.md";
-
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
- CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
- // just in case
- if (doc == null)
- return doc;
-
- // add the fields from crawldb
- if (dbFieldnames != null) {
- for (String metatag : dbFieldnames) {
- Text metadata = (Text) datum.getMetaData().get(
- new Text(metatag));
- if (metadata != null)
- doc.add(metatag, metadata.toString());
- }
- }
-
- // add the fields from parsemd
- if (parseFieldnames != null) {
- for (String metatag : parseFieldnames) {
- for (String value : parse.getData().getParseMeta().getValues(metatag)) {
- if (value != null)
- doc.add(metatag, value);
- }
- }
- }
-
- // add the fields from contentmd
- if (contentFieldnames != null) {
- for (String metatag : contentFieldnames) {
- for (String value : parse.getData().getContentMeta().getValues(metatag)) {
- if (value != null)
- doc.add(metatag, value);
- }
- }
- }
-
- return doc;
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
- parseFieldnames = conf.getStrings(parse_CONF_PROPERTY);
- contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
-
- // TODO check conflict between field names e.g. could have same label
- // from different sources
-
- }
-
- public Configuration getConf() {
- return this.conf;
- }
+ private Configuration conf;
+ private String[] dbFieldnames;
+ private Map<String, String> parseFieldnames;
+ private String[] contentFieldnames;
+ private static final String db_CONF_PROPERTY = "index.db.md";
+ private static final String parse_CONF_PROPERTY = "index.parse.md";
+ private static final String content_CONF_PROPERTY = "index.content.md";
+
+  /**
+   * Adds the configured metadata values to the document being indexed.
+   * Values are read from the CrawlDatum metadata, the parse metadata and
+   * the content metadata, in that order.
+   *
+   * @return the document passed in (null if it was null)
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // nothing to enrich without a document
+    if (doc == null) {
+      return null;
+    }
+
+    // crawldb: the metadata key doubles as the field name
+    if (dbFieldnames != null) {
+      for (String dbKey : dbFieldnames) {
+        Text dbValue = (Text) datum.getMetaData().get(new Text(dbKey));
+        if (dbValue != null) {
+          doc.add(dbKey, dbValue.toString());
+        }
+      }
+    }
+
+    // parse metadata: the map key is used for the lookup, the map value
+    // is the name of the field added to the document
+    if (parseFieldnames != null) {
+      for (Map.Entry<String, String> field : parseFieldnames.entrySet()) {
+        for (String parseValue : parse.getData().getParseMeta()
+            .getValues(field.getKey())) {
+          if (parseValue != null) {
+            doc.add(field.getValue(), parseValue);
+          }
+        }
+      }
+    }
+
+    // content metadata: the metadata key doubles as the field name
+    if (contentFieldnames != null) {
+      for (String contentKey : contentFieldnames) {
+        for (String contentValue : parse.getData().getContentMeta()
+            .getValues(contentKey)) {
+          if (contentValue != null) {
+            doc.add(contentKey, contentValue);
+          }
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Stores the configuration and reads the names of the metadata to index
+   * from the properties "index.db.md", "index.parse.md" and
+   * "index.content.md".
+   * <p>
+   * Parse metadata names are kept in a map from the lower-cased name (used
+   * to look the value up) to the name as configured (used as field name).
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
+
+    // getStrings() yields null when the property is not set; guard the loop
+    // so that a missing "index.parse.md" results in an empty map instead of
+    // a NullPointerException
+    parseFieldnames = new HashMap<String, String>();
+    String[] parseNames = conf.getStrings(parse_CONF_PROPERTY);
+    if (parseNames != null) {
+      for (String metatag : parseNames) {
+        parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag);
+      }
+    }
+
+    contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
+
+    // TODO check conflict between field names e.g. could have same label
+    // from different sources
+
+  }
+
+  /** Returns the configuration last passed to setConf. */
+  public Configuration getConf() {
+    return conf;
+  }
}
Modified: nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java Wed Jul 30 08:55:24 2014
@@ -18,6 +18,7 @@ package org.apache.nutch.parse.metatags;
import java.util.Enumeration;
import java.util.HashSet;
+import java.util.Locale;
import java.util.Properties;
import java.util.Set;
@@ -35,7 +36,7 @@ import org.w3c.dom.DocumentFragment;
/**
* Parse HTML meta tags (keywords, description) and store them in the parse
* metadata so that they can be indexed with the index-metadata plugin with the
- * prefix 'metatag.'
+ * prefix 'metatag.'. Metatags are matched ignoring case.
*/
public class MetaTagsParser implements HtmlParseFilter {
@@ -50,16 +51,48 @@ public class MetaTagsParser implements H
this.conf = conf;
// specify whether we want a specific subset of metadata
// by default take everything we can find
- String metatags = conf.get("metatags.names", "*");
- String[] values = metatags.split(";");
- for (String val : values)
- metatagset.add(val.toLowerCase());
+ String[] values = conf.getStrings("metatags.names", "*");
+ for (String val : values) {
+ metatagset.add(val.toLowerCase(Locale.ROOT));
+ }
}
public Configuration getConf() {
return this.conf;
}
+  /**
+   * Adds a single-valued metatag to the parse metadata under the key
+   * 'metatag.' + lower-cased name, provided the lower-cased name is in the
+   * configured set of metatags or the set contains '*'.
+   */
+  private void addIndexedMetatags(Metadata metadata, String metatag,
+      String value) {
+    String key = metatag.toLowerCase(Locale.ROOT);
+    boolean wanted = metatagset.contains("*") || metatagset.contains(key);
+    if (!wanted) {
+      return;
+    }
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Found meta tag: " + key + "\t" + value);
+    }
+    metadata.add("metatag." + key, value);
+  }
+
+  /**
+   * Adds every value of a multi-valued metatag to the parse metadata under
+   * the key 'metatag.' + lower-cased name, provided the lower-cased name is
+   * in the configured set of metatags or the set contains '*'.
+   */
+  private void addIndexedMetatags(Metadata metadata, String metatag,
+      String[] values) {
+    String key = metatag.toLowerCase(Locale.ROOT);
+    boolean wanted = metatagset.contains("*") || metatagset.contains(key);
+    if (!wanted) {
+      return;
+    }
+    for (String single : values) {
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Found meta tag: " + key + "\t" + single);
+      }
+      metadata.add("metatag." + key, single);
+    }
+  }
+
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
@@ -68,42 +101,21 @@ public class MetaTagsParser implements H
// check in the metadata first : the tika-parser
// might have stored the values there already
-
for (String mdName : metadata.names()) {
- String value = metadata.get(mdName);
- // check whether the name is in the list of what we want or if
- // specified *
- if (metatagset.contains("*") || metatagset.contains(mdName.toLowerCase())) {
- LOG.debug("Found meta tag : " + mdName + "\t" + value);
- metadata.add("metatag." + mdName.toLowerCase(), value);
- }
+ addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
}
Metadata generalMetaTags = metaTags.getGeneralTags();
- for (String tagName : generalMetaTags.names() ) {
- String[] tagValues = generalMetaTags.getValues(tagName);
-
- for ( String tagValue : tagValues ) {
- // check whether the name is in the list of what we want or if
- // specified *
- if (metatagset.contains("*") || metatagset.contains(tagName.toLowerCase())) {
- LOG.debug("Found meta tag : " + tagName + "\t" + tagValue);
- metadata.add("metatag." + tagName.toLowerCase(), tagValue);
- }
- }
+ for (String tagName : generalMetaTags.names()) {
+ addIndexedMetatags(metadata, tagName, generalMetaTags.getValues(tagName));
}
Properties httpequiv = metaTags.getHttpEquivTags();
- for (Enumeration tagNames = httpequiv.propertyNames(); tagNames
+ for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
.hasMoreElements();) {
String name = (String) tagNames.nextElement();
String value = httpequiv.getProperty(name);
- // check whether the name is in the list of what we want or if
- // specified *
- if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) {
- LOG.debug("Found meta tag : " + name + "\t" + value);
- metadata.add("metatag." + name.toLowerCase(), value);
- }
+ addIndexedMetatags(metadata, name, value);
}
return parseResult;
Modified: nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java?rev=1614586&r1=1614585&r2=1614586&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java Wed Jul 30 08:55:24 2014
@@ -34,18 +34,18 @@ import org.junit.Assert;
import org.junit.Test;
public class TestMetatagParser {
-
+
private String fileSeparator = System.getProperty("file.separator");
private String sampleDir = System.getProperty("test.data", ".");
private String sampleFile = "testMetatags.html";
private String sampleFileMultival = "testMultivalueMetatags.html";
private String description = "This is a test of description";
private String keywords = "This is a test of keywords";
-
+
public Metadata parseMeta(String fileName, Configuration conf) {
Metadata metadata = null;
try {
- String urlString = "file:" + sampleDir + fileSeparator + fileName;
+ String urlString = "file:" + sampleDir + fileSeparator + fileName;
Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
Content content = protocol.getProtocolOutput(new Text(urlString),
new CrawlDatum()).getContent();
@@ -59,43 +59,46 @@ public class TestMetatagParser {
}
@Test
+ /** test defaults: keywords and description */
public void testIt() {
Configuration conf = NutchConfiguration.create();
-
+
// check that we get the same values
- Metadata parseMeta= parseMeta(sampleFile, conf);
-
+ Metadata parseMeta = parseMeta(sampleFile, conf);
+
Assert.assertEquals(description, parseMeta.get("metatag.description"));
Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
}
@Test
+ /** test multiple metatags resulting in metadata with multiple values */
public void testMultiValueMetatags() {
Configuration conf = NutchConfiguration.create();
- conf.set("metatags.names", "keywords;DC.creator");
+ conf.set("metatags.names", "keywords,DC.creator");
conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
Metadata parseMeta = parseMeta(sampleFileMultival, conf);
-
+
String failMessage = "One value of metatag with multiple values is missing: ";
Set<String> valueSet = new TreeSet<String>();
for (String val : parseMeta.getValues("metatag.dc.creator")) {
valueSet.add(val);
}
- String[] expectedValues1 = {"Doug Cutting", "Michael Cafarella"};
+ String[] expectedValues1 = { "Doug Cutting", "Michael Cafarella" };
for (String val : expectedValues1) {
- Assert.assertTrue(failMessage + val, valueSet.contains(val));
+ Assert.assertTrue(failMessage + val, valueSet.contains(val));
}
-
+
valueSet.clear();
for (String val : parseMeta.getValues("metatag.keywords")) {
valueSet.add(val);
}
- String[] expectedValues2 = {"robot d'indexation", "web crawler", "Webcrawler"};
+ String[] expectedValues2 = { "robot d'indexation", "web crawler",
+ "Webcrawler" };
for (String val : expectedValues2) {
- Assert.assertTrue(failMessage + val, valueSet.contains(val));
+ Assert.assertTrue(failMessage + val, valueSet.contains(val));
}
}
-
+
}