You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/05/21 21:50:14 UTC

svn commit: r1596662 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/parse/ParserChecker.java

Author: jnioche
Date: Wed May 21 19:50:14 2014
New Revision: 1596662

URL: http://svn.apache.org/r1596662
Log:
NUTCH-1757 ParserChecker to take custom metadata as input

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1596662&r1=1596661&r2=1596662&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 21 19:50:14 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1757 ParserChecker to take custom metadata as input (jnioche)
+
 * NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
 
 * NUTCH-1772 Injector does not need merging if no pre-existing crawldb (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1596662&r1=1596661&r2=1596662&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed May 21 19:50:14 2014
@@ -17,6 +17,9 @@
 
 package org.apache.nutch.parse;
 
+import java.util.HashMap;
+import java.util.Iterator;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -29,6 +32,7 @@ import org.apache.nutch.protocol.Content
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.URLUtil;
 import org.apache.nutch.util.StringUtil;
@@ -73,19 +77,32 @@ public class ParserChecker implements To
     String contentType = null;
     String url = null;
 
-    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
+    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
 
     if (args.length == 0) {
       LOG.error(usage);
       return (-1);
     }
 
+    // used to simulate the metadata propagated from injection
+    HashMap<String, String> metadata = new HashMap<String, String>();
+
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-forceAs")) {
         force = true;
         contentType = args[++i];
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
+      } else if (args[i].equals("-md")) {
+        String k = null, v = null;
+        String nextOne = args[++i];
+        int firstEquals = nextOne.indexOf("=");
+        if (firstEquals != -1) {
+          k = nextOne.substring(0, firstEquals);
+          v = nextOne.substring(firstEquals + 1);
+        } else
+          k = nextOne;
+        metadata.put(k, v);
       } else if (i != args.length - 1) {
         LOG.error(usage);
         System.exit(-1);
@@ -98,9 +115,21 @@ public class ParserChecker implements To
       LOG.info("fetching: " + url);
     }
 
+    CrawlDatum cd = new CrawlDatum();
+
+    Iterator<String> iter = metadata.keySet().iterator();
+    while (iter.hasNext()) {
+      String key = iter.next();
+      String value = metadata.get(key);
+      if (value == null)
+        value = "";
+      cd.getMetaData().put(new Text(key), new Text(value));
+    }
+
     ProtocolFactory factory = new ProtocolFactory(conf);
     Protocol protocol = factory.getProtocol(url);
-    ProtocolOutput output = protocol.getProtocolOutput(new Text(url), new CrawlDatum());
+    Text turl = new Text(url);
+    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
     
     if (!output.getStatus().isSuccess()) {
       System.err.println("Fetch failed with protocol status: " + output.getStatus());
@@ -129,6 +158,16 @@ public class ParserChecker implements To
       LOG.warn("Content is truncated, parse may fail!");
     }
 
+    ScoringFilters scfilters = new ScoringFilters(conf);
+    // call the scoring filters
+    try {
+      scfilters.passScoreBeforeParsing(turl, cd, content);
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Couldn't pass score, url " + turl.toString() + " (" + e + ")");
+      }
+    }    
+    
     ParseResult parseResult = new ParseUtil(conf).parse(content);
 
     if (parseResult == null) {
@@ -145,6 +184,15 @@ public class ParserChecker implements To
       LOG.info("signature: " + StringUtil.toHexString(signature));
     }
 
+    // call the scoring filters
+    try {
+      scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
+    } catch (Exception e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Couldn't pass score, url " + turl + " (" + e + ")");
+      }
+    }
+
     for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
       Parse parse = entry.getValue();
       LOG.info("---------\nUrl\n---------------\n");