You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2013/06/13 22:45:37 UTC
svn commit: r1492856 - in /nutch/trunk: ./ src/java/org/apache/nutch/parse/
src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/
src/plugin/parse-metatags/ src/plugin/parse-me...
Author: snagel
Date: Thu Jun 13 20:45:37 2013
New Revision: 1492856
URL: http://svn.apache.org/r1492856
Log:
NUTCH-1467 and NUTCH-1560: add all values of multi-valued metatags
Added:
nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html (with props)
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
nutch/trunk/src/plugin/parse-metatags/build.xml
nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jun 13 20:45:37 2013
@@ -2,6 +2,10 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1560 index-metadata to add all values of multivalued metadata (snagel)
+
+* NUTCH-1467 Not able to parse multiValued metatags (kiran via snagel)
+
* NUTCH-1430 Freegenerator records overwrite CrawlDB records with AdaptiveFetchSchedule (markus)
* NUTCH-1522 Upgrade to Tika 1.3 (jnioche)
Modified: nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java Thu Jun 13 20:45:37 2013
@@ -21,6 +21,8 @@ import java.net.URL;
import java.util.Iterator;
import java.util.Properties;
+import org.apache.nutch.metadata.Metadata;
+
/**
* This class holds the information about HTML "meta" tags extracted from
* a page. Some special tags have convenience methods for easy checking.
@@ -40,10 +42,10 @@ public class HTMLMetaTags {
private URL refreshHref = null;
- private Properties generalTags = new Properties();
+ private Metadata generalTags = new Metadata();
private Properties httpEquivTags = new Properties();
-
+
/**
* Sets all boolean values to <code>false</code>. Clears all other tags.
*/
@@ -166,7 +168,7 @@ public class HTMLMetaTags {
* Returns all collected values of the general meta tags. Property names are
* tag names, property values are "content" values.
*/
- public Properties getGeneralTags() {
+ public Metadata getGeneralTags() {
return generalTags;
}
@@ -188,12 +190,13 @@ public class HTMLMetaTags {
+ ", refreshHref=" + refreshHref + "\n"
);
sb.append(" * general tags:\n");
- Iterator<Object> it = generalTags.keySet().iterator();
- while (it.hasNext()) {
- String key = (String)it.next();
+ String[] names = generalTags.names();
+ for (String name : names) {
+ String key = name;
sb.append(" - " + key + "\t=\t" + generalTags.get(key) + "\n");
}
sb.append(" * http-equiv tags:\n");
+ Iterator<Object> it = httpEquivTags.keySet().iterator();
it = httpEquivTags.keySet().iterator();
while (it.hasNext()) {
String key = (String)it.next();
Modified: nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java (original)
+++ nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java Thu Jun 13 20:45:37 2013
@@ -65,18 +65,20 @@ public class MetadataIndexer implements
// add the fields from parsemd
if (parseFieldnames != null) {
for (String metatag : parseFieldnames) {
- String value = parse.getData().getParseMeta().get(metatag);
- if (value != null)
- doc.add(metatag, value);
+ for (String value : parse.getData().getParseMeta().getValues(metatag)) {
+ if (value != null)
+ doc.add(metatag, value);
+ }
}
}
// add the fields from contentmd
if (contentFieldnames != null) {
for (String metatag : contentFieldnames) {
- String value = parse.getData().getContentMeta().get(metatag);
- if (value != null)
- doc.add(metatag, value);
+ for (String value : parse.getData().getContentMeta().getValues(metatag)) {
+ if (value != null)
+ doc.add(metatag, value);
+ }
}
}
Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Thu Jun 13 20:45:37 2013
@@ -78,7 +78,7 @@ public class HTMLMetaProcessor {
if (nameNode != null) {
if (contentNode != null) {
String name = nameNode.getNodeValue().toLowerCase();
- metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+ metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
if (contentNode != null) {
Modified: nutch/trunk/src/plugin/parse-metatags/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/build.xml?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/build.xml (original)
+++ nutch/trunk/src/plugin/parse-metatags/build.xml Thu Jun 13 20:45:37 2013
@@ -28,6 +28,10 @@
<!-- for junit test -->
<mkdir dir="${build.test}/data" />
- <copy file="sample/testMetatags.html" todir="${build.test}/data" />
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.html" />
+ </fileset>
+ </copy>
</project>
Added: nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html?rev=1492856&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html (added)
+++ nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html Thu Jun 13 20:45:37 2013
@@ -0,0 +1,12 @@
+<html>
+<head>
+<meta name="DC.creator" content="Doug Cutting">
+<meta name="DC.creator" content="Michael Cafarella">
+<!-- meta keywords in different casing -->
+<meta name="keywords" lang="en" content="web crawler" />
+<meta name="Keywords" lang="fr" content="robot d'indexation" />
+<meta name="KEYWORDS" lang="de" content="Webcrawler" />
+</head>
+<body>
+A test for multi-valued metatags.
+</body>
\ No newline at end of file
Propchange: nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
------------------------------------------------------------------------------
svn:eol-style = native
Modified: nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java Thu Jun 13 20:45:37 2013
@@ -75,16 +75,17 @@ public class MetaTagsParser implements H
}
}
- Properties generalMetaTags = metaTags.getGeneralTags();
- for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames
- .hasMoreElements();) {
- String name = (String) tagNames.nextElement();
- String value = generalMetaTags.getProperty(name);
+ Metadata generalMetaTags = metaTags.getGeneralTags();
+ for (String tagName : generalMetaTags.names() ) {
+ String[] tagValues = generalMetaTags.getValues(tagName);
+
+ for ( String tagValue : tagValues ) {
// check whether the name is in the list of what we want or if
// specified *
- if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) {
- LOG.debug("Found meta tag : " + name + "\t" + value);
- metadata.add("metatag." + name.toLowerCase(), value);
+ if (metatagset.contains("*") || metatagset.contains(tagName.toLowerCase())) {
+ LOG.debug("Found meta tag : " + tagName + "\t" + tagValue);
+ metadata.add("metatag." + tagName.toLowerCase(), tagValue);
+ }
}
}
Modified: nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java Thu Jun 13 20:45:37 2013
@@ -17,6 +17,9 @@
package org.apache.nutch.parse.html;
+import java.util.Set;
+import java.util.TreeSet;
+
import junit.framework.TestCase;
import org.apache.hadoop.conf.Configuration;
@@ -35,6 +38,7 @@ public class TestMetatagParser extends T
private String fileSeparator = System.getProperty("file.separator");
private String sampleDir = System.getProperty("test.data", ".");
private String sampleFile = "testMetatags.html";
+ private String sampleFileMultival = "testMultivalueMetatags.html";
private String description = "This is a test of description";
private String keywords = "This is a test of keywords";
@@ -42,27 +46,58 @@ public class TestMetatagParser extends T
super(name);
}
- public void testIt() {
- Configuration conf = NutchConfiguration.create();
-
- String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
-
+ public Metadata parseMeta(String fileName, Configuration conf) {
+ Metadata metadata = null;
try {
+ String urlString = "file:" + sampleDir + fileSeparator + fileName;
Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
Content content = protocol.getProtocolOutput(new Text(urlString),
new CrawlDatum()).getContent();
-
Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
- // check that we get the same values
- Metadata parseMeta = parse.getData().getParseMeta();
-
- assertEquals(description, parseMeta.get("metatag.description"));
- assertEquals(keywords, parseMeta.get("metatag.keywords"));
+ metadata = parse.getData().getParseMeta();
} catch (Exception e) {
e.printStackTrace();
fail(e.toString());
}
+ return metadata;
+ }
+
+ public void testIt() {
+ Configuration conf = NutchConfiguration.create();
+
+ // check that we get the same values
+ Metadata parseMeta= parseMeta(sampleFile, conf);
+
+ assertEquals(description, parseMeta.get("metatag.description"));
+ assertEquals(keywords, parseMeta.get("metatag.keywords"));
+ }
+
+ public void testMultiValueMetatags() {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("metatags.names", "keywords;DC.creator");
+ conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
+
+ Metadata parseMeta = parseMeta(sampleFileMultival, conf);
+
+ String failMessage = "One value of metatag with multiple values is missing: ";
+
+ Set<String> valueSet = new TreeSet<String>();
+ for (String val : parseMeta.getValues("metatag.dc.creator")) {
+ valueSet.add(val);
+ }
+ String[] expectedValues1 = {"Doug Cutting", "Michael Cafarella"};
+ for (String val : expectedValues1) {
+ assertTrue(failMessage + val, valueSet.contains(val));
+ }
+
+ valueSet.clear();
+ for (String val : parseMeta.getValues("metatag.keywords")) {
+ valueSet.add(val);
+ }
+ String[] expectedValues2 = {"robot d'indexation", "web crawler", "Webcrawler"};
+ for (String val : expectedValues2) {
+ assertTrue(failMessage + val, valueSet.contains(val));
+ }
}
}
Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java Thu Jun 13 20:45:37 2013
@@ -49,7 +49,7 @@ public class HTMLMetaProcessor {
private static final void getMetaTagsHelper(
HTMLMetaTags metaTags, Node node, URL currURL) {
-
+
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("body".equalsIgnoreCase(node.getNodeName())) {
@@ -77,8 +77,8 @@ public class HTMLMetaProcessor {
if (nameNode != null) {
if (contentNode != null) {
- String name = nameNode.getNodeValue().toLowerCase();
- metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+ String name = nameNode.getNodeValue().toLowerCase();
+ metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
if (contentNode != null) {