You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2013/06/13 22:45:37 UTC

svn commit: r1492856 - in /nutch/trunk: ./ src/java/org/apache/nutch/parse/ src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ src/plugin/parse-metatags/ src/plugin/parse-me...

Author: snagel
Date: Thu Jun 13 20:45:37 2013
New Revision: 1492856

URL: http://svn.apache.org/r1492856
Log:
NUTCH-1467 and NUTCH-1560: add all values of multi-valued metatags

Added:
    nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html   (with props)
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
    nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
    nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
    nutch/trunk/src/plugin/parse-metatags/build.xml
    nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
    nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jun 13 20:45:37 2013
@@ -2,6 +2,10 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1560 index-metadata to add all values of multivalued metadata (snagel)
+
+* NUTCH-1467 Not able to parse multiValued metatags (kiran via snagel)
+
 * NUTCH-1430 Freegenerator records overwrite CrawlDB records with AdaptiveFetchSchedule (markus)
 
 * NUTCH-1522 Upgrade to Tika 1.3 (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java Thu Jun 13 20:45:37 2013
@@ -21,6 +21,8 @@ import java.net.URL;
 import java.util.Iterator;
 import java.util.Properties;
 
+import org.apache.nutch.metadata.Metadata;
+
 /**
  * This class holds the information about HTML "meta" tags extracted from 
  * a page. Some special tags have convenience methods for easy checking.
@@ -40,10 +42,10 @@ public class HTMLMetaTags {
 
   private URL refreshHref = null;
 
-  private Properties generalTags = new Properties();
+  private Metadata generalTags = new Metadata();
 
   private Properties httpEquivTags = new Properties();
-
+  
   /**
    * Sets all boolean values to <code>false</code>. Clears all other tags.
    */
@@ -166,7 +168,7 @@ public class HTMLMetaTags {
    * Returns all collected values of the general meta tags. Property names are
    * tag names, property values are "content" values.
    */
-  public Properties getGeneralTags() {
+  public Metadata getGeneralTags() {
     return generalTags;
   }
 
@@ -188,12 +190,13 @@ public class HTMLMetaTags {
             + ", refreshHref=" + refreshHref + "\n"
             );
     sb.append(" * general tags:\n");
-    Iterator<Object> it = generalTags.keySet().iterator();
-    while (it.hasNext()) {
-      String key = (String)it.next();
+    String[] names = generalTags.names();
+    for (String name : names) {
+      String key = name;
       sb.append("   - " + key + "\t=\t" + generalTags.get(key) + "\n");
     }
     sb.append(" * http-equiv tags:\n");
+    Iterator<Object> it = httpEquivTags.keySet().iterator();
     it = httpEquivTags.keySet().iterator();
     while (it.hasNext()) {
       String key = (String)it.next();

Modified: nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java (original)
+++ nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java Thu Jun 13 20:45:37 2013
@@ -65,18 +65,20 @@ public class MetadataIndexer implements 
 		// add the fields from parsemd
 		if (parseFieldnames != null) {
 			for (String metatag : parseFieldnames) {
-				String value = parse.getData().getParseMeta().get(metatag);
-				if (value != null)
-					doc.add(metatag, value);
+				for (String value : parse.getData().getParseMeta().getValues(metatag)) {
+					if (value != null)
+						doc.add(metatag, value);
+				}
 			}
 		}
 
 		// add the fields from contentmd
 		if (contentFieldnames != null) {
 			for (String metatag : contentFieldnames) {
-				String value = parse.getData().getContentMeta().get(metatag);
-				if (value != null)
-					doc.add(metatag, value);
+				for (String value : parse.getData().getContentMeta().getValues(metatag)) {
+					if (value != null)
+						doc.add(metatag, value);
+				}
 			}
 		}
 

Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original)
+++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Thu Jun 13 20:45:37 2013
@@ -78,7 +78,7 @@ public class HTMLMetaProcessor {
         if (nameNode != null) {
           if (contentNode != null) {
             String name = nameNode.getNodeValue().toLowerCase();
-            metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+            metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
             if ("robots".equals(name)) {
   
               if (contentNode != null) {

Modified: nutch/trunk/src/plugin/parse-metatags/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/build.xml?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/build.xml (original)
+++ nutch/trunk/src/plugin/parse-metatags/build.xml Thu Jun 13 20:45:37 2013
@@ -28,6 +28,10 @@
 
 	<!-- for junit test -->
 	<mkdir dir="${build.test}/data" />
-	<copy file="sample/testMetatags.html" todir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.html" />
+		</fileset>
+	</copy>
 
 </project>

Added: nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html?rev=1492856&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html (added)
+++ nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html Thu Jun 13 20:45:37 2013
@@ -0,0 +1,12 @@
+<html>
+<head>
+<meta name="DC.creator" content="Doug Cutting">
+<meta name="DC.creator" content="Michael Cafarella">
+<!-- meta keywords in different casing -->
+<meta name="keywords" lang="en" content="web crawler" />
+<meta name="Keywords" lang="fr" content="robot d'indexation" />
+<meta name="KEYWORDS" lang="de" content="Webcrawler" />
+</head>
+<body>
+A test for multi-valued metatags.
+</body>
\ No newline at end of file

Propchange: nutch/trunk/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java Thu Jun 13 20:45:37 2013
@@ -75,16 +75,17 @@ public class MetaTagsParser implements H
       }
     }
 
-    Properties generalMetaTags = metaTags.getGeneralTags();
-    for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames
-        .hasMoreElements();) {
-      String name = (String) tagNames.nextElement();
-      String value = generalMetaTags.getProperty(name);
+    Metadata generalMetaTags = metaTags.getGeneralTags();
+    for (String tagName : generalMetaTags.names() ) {
+    String[] tagValues = generalMetaTags.getValues(tagName);    
+  
+      for ( String tagValue : tagValues ) {
       // check whether the name is in the list of what we want or if
       // specified *
-      if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) {
-        LOG.debug("Found meta tag : " + name + "\t" + value);
-        metadata.add("metatag." + name.toLowerCase(), value);
+    	 if (metatagset.contains("*") || metatagset.contains(tagName.toLowerCase())) {
+    		 LOG.debug("Found meta tag : " + tagName + "\t" + tagValue);
+    		 metadata.add("metatag." + tagName.toLowerCase(), tagValue);
+    	 }
       }
     }
 

Modified: nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java (original)
+++ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java Thu Jun 13 20:45:37 2013
@@ -17,6 +17,9 @@
 
 package org.apache.nutch.parse.html;
 
+import java.util.Set;
+import java.util.TreeSet;
+
 import junit.framework.TestCase;
 
 import org.apache.hadoop.conf.Configuration;
@@ -35,6 +38,7 @@ public class TestMetatagParser extends T
   private String fileSeparator = System.getProperty("file.separator");
   private String sampleDir = System.getProperty("test.data", ".");
   private String sampleFile = "testMetatags.html";
+  private String sampleFileMultival = "testMultivalueMetatags.html";
   private String description = "This is a test of description";
   private String keywords = "This is a test of keywords";
   
@@ -42,27 +46,58 @@ public class TestMetatagParser extends T
     super(name);
   }
   
-  public void testIt() {
-    Configuration conf = NutchConfiguration.create();
-    
-    String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
-    
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
     try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;     
       Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
       Content content = protocol.getProtocolOutput(new Text(urlString),
           new CrawlDatum()).getContent();
-      
       Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-      
-      // check that we get the same values
-      Metadata parseMeta = parse.getData().getParseMeta();
-      
-      assertEquals(description, parseMeta.get("metatag.description"));
-      assertEquals(keywords, parseMeta.get("metatag.keywords"));
+      metadata = parse.getData().getParseMeta();
     } catch (Exception e) {
       e.printStackTrace();
       fail(e.toString());
     }
+    return metadata;
+  }
+
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+    
+    // check that we get the same values
+    Metadata parseMeta= parseMeta(sampleFile, conf);
+      
+    assertEquals(description, parseMeta.get("metatag.description"));
+    assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+
+  public void testMultiValueMetatags() {
+    Configuration conf = NutchConfiguration.create();
+    conf.set("metatags.names", "keywords;DC.creator");
+    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
+
+    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
+    
+    String failMessage = "One value of metatag with multiple values is missing: ";
+
+    Set<String> valueSet = new TreeSet<String>();
+    for (String val : parseMeta.getValues("metatag.dc.creator")) {
+      valueSet.add(val);
+    }
+    String[] expectedValues1 = {"Doug Cutting", "Michael Cafarella"};
+    for (String val : expectedValues1) {
+      assertTrue(failMessage + val, valueSet.contains(val));      
+    }
+    
+    valueSet.clear();
+    for (String val : parseMeta.getValues("metatag.keywords")) {
+      valueSet.add(val);
+    }
+    String[] expectedValues2 = {"robot d'indexation", "web crawler", "Webcrawler"};
+    for (String val : expectedValues2) {
+      assertTrue(failMessage + val, valueSet.contains(val));      
+    }
   }
   
 }

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=1492856&r1=1492855&r2=1492856&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java Thu Jun 13 20:45:37 2013
@@ -49,7 +49,7 @@ public class HTMLMetaProcessor {
 
   private static final void getMetaTagsHelper(
     HTMLMetaTags metaTags, Node node, URL currURL) {
-
+	  
     if (node.getNodeType() == Node.ELEMENT_NODE) {
 
       if ("body".equalsIgnoreCase(node.getNodeName())) {
@@ -77,8 +77,8 @@ public class HTMLMetaProcessor {
         
         if (nameNode != null) {
           if (contentNode != null) {
-            String name = nameNode.getNodeValue().toLowerCase();
-            metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+            String name = nameNode.getNodeValue().toLowerCase();   
+            metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
             if ("robots".equals(name)) {
   
               if (contentNode != null) {