You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/05/21 21:50:14 UTC
svn commit: r1596662 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/parse/ParserChecker.java
Author: jnioche
Date: Wed May 21 19:50:14 2014
New Revision: 1596662
URL: http://svn.apache.org/r1596662
Log:
NUTCH-1757 ParserChecker to take custom metadata as input
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1596662&r1=1596661&r2=1596662&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 21 19:50:14 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1757 ParserChecker to take custom metadata as input (jnioche)
+
* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus)
* NUTCH-1772 Injector does not need merging if no pre-existing crawldb (jnioche)
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1596662&r1=1596661&r2=1596662&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed May 21 19:50:14 2014
@@ -17,6 +17,9 @@
package org.apache.nutch.parse;
+import java.util.HashMap;
+import java.util.Iterator;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -29,6 +32,7 @@ import org.apache.nutch.protocol.Content
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.StringUtil;
@@ -73,19 +77,32 @@ public class ParserChecker implements To
String contentType = null;
String url = null;
- String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
+ String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
if (args.length == 0) {
LOG.error(usage);
return (-1);
}
+ // used to simulate the metadata propagated from injection
+ HashMap<String, String> metadata = new HashMap<String, String>();
+
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-forceAs")) {
force = true;
contentType = args[++i];
} else if (args[i].equals("-dumpText")) {
dumpText = true;
+ } else if (args[i].equals("-md")) {
+ String k = null, v = null;
+ String nextOne = args[++i];
+ int firstEquals = nextOne.indexOf("=");
+ if (firstEquals != -1) {
+ k = nextOne.substring(0, firstEquals);
+ v = nextOne.substring(firstEquals + 1);
+ } else
+ k = nextOne;
+ metadata.put(k, v);
} else if (i != args.length - 1) {
LOG.error(usage);
System.exit(-1);
@@ -98,9 +115,21 @@ public class ParserChecker implements To
LOG.info("fetching: " + url);
}
+ CrawlDatum cd = new CrawlDatum();
+
+ Iterator<String> iter = metadata.keySet().iterator();
+ while (iter.hasNext()) {
+ String key = iter.next();
+ String value = metadata.get(key);
+ if (value == null)
+ value = "";
+ cd.getMetaData().put(new Text(key), new Text(value));
+ }
+
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
- ProtocolOutput output = protocol.getProtocolOutput(new Text(url), new CrawlDatum());
+ Text turl = new Text(url);
+ ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
if (!output.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: " + output.getStatus());
@@ -129,6 +158,16 @@ public class ParserChecker implements To
LOG.warn("Content is truncated, parse may fail!");
}
+ ScoringFilters scfilters = new ScoringFilters(conf);
+ // call the scoring filters
+ try {
+ scfilters.passScoreBeforeParsing(turl, cd, content);
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Couldn't pass score, url " + turl.toString() + " (" + e + ")");
+ }
+ }
+
ParseResult parseResult = new ParseUtil(conf).parse(content);
if (parseResult == null) {
@@ -145,6 +184,15 @@ public class ParserChecker implements To
LOG.info("signature: " + StringUtil.toHexString(signature));
}
+ // call the scoring filters
+ try {
+ scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Couldn't pass score, url " + turl + " (" + e + ")");
+ }
+ }
+
for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
Parse parse = entry.getValue();
LOG.info("---------\nUrl\n---------------\n");