You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/01/06 18:01:53 UTC
svn commit: r896539 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/Injector.java
Author: jnioche
Date: Wed Jan 6 17:01:51 2010
New Revision: 896539
URL: http://svn.apache.org/viewvc?rev=896539&view=rev
Log:
NUTCH-655 : Injecting Crawl metadata (jnioche)
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=896539&r1=896538&r2=896539&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan 6 17:01:51 2010
@@ -2,6 +2,8 @@
Unreleased Changes
+* NUTCH-655 Injecting Crawl metadata (jnioche)
+
* NUTCH-658 Use counters to report fetching and parsing status (jnioche)
* NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann)
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=896539&r1=896538&r2=896539&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Jan 6 17:01:51 2010
@@ -37,10 +37,21 @@
import org.apache.nutch.util.NutchJob;
/** This class takes a flat file of URLs and adds them to the list of pages to be
- * crawled. Useful for bootstrapping the system. */
+ * crawled. Useful for bootstrapping the system.
+ * The URL files contain one URL per line, optionally followed by custom metadata
+ * separated by tabs with the metadata key separated from the corresponding value by '='. <br>
+ * Note that some metadata keys are reserved : <br>
+ * - <i>nutch.score</i> : allows setting a custom score for a specific URL <br>
+ * - <i>nutch.fetchInterval</i> : allows setting a custom fetch interval for a specific URL <br>
+ * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
+ **/
public class Injector extends Configured implements Tool {
public static final Log LOG = LogFactory.getLog(Injector.class);
-
+
+ /** metadata key reserved for setting a custom score for a specific URL */
+ public static String nutchScoreMDName = "nutch.score";
+ /** metadata key reserved for setting a custom fetchInterval for a specific URL */
+ public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
/** Normalize and filter injected urls. */
public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
@@ -68,6 +79,36 @@
OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
String url = value.toString(); // value is line of text
+ // if tabs : metadata that could be stored
+ // must be name=value and separated by \t
+ float customScore = -1f;
+ int customInterval = interval;
+ Map<String,String> metadata = new TreeMap<String,String>();
+ if (url.indexOf("\t")!=-1){
+ String[] splits = url.split("\t");
+ url = splits[0];
+ for (int s=1;s<splits.length;s++){
+ // find separation between name and value
+ int indexEquals = splits[s].indexOf("=");
+ if (indexEquals==-1) {
+ // skip anything without a =
+ continue;
+ }
+ String metaname = splits[s].substring(0, indexEquals);
+ String metavalue = splits[s].substring(indexEquals+1);
+ if (metaname.equals(nutchScoreMDName)) {
+ try {
+ customScore = Float.parseFloat(metavalue);}
+ catch (NumberFormatException nfe){}
+ }
+ else if (metaname.equals(nutchFetchIntervalMDName)) {
+ try {
+ customInterval = Integer.parseInt(metavalue);}
+ catch (NumberFormatException nfe){}
+ }
+ else metadata.put(metaname,metavalue);
+ }
+ }
try {
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
url = filters.filter(url); // filter the url
@@ -77,17 +118,27 @@
}
if (url != null) { // if it passes
value.set(url); // collect it
- CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, interval);
+ CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, customInterval);
datum.setFetchTime(curTime);
- datum.setScore(scoreInjected);
- try {
- scfilters.injectedScore(value, datum);
- } catch (ScoringFilterException e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Cannot filter injected score for url " + url +
- ", using default (" + e.getMessage() + ")");
- }
+ if (customScore != -1) datum.setScore(customScore);
+ else {
datum.setScore(scoreInjected);
+ try {
+ scfilters.injectedScore(value, datum);
+ } catch (ScoringFilterException e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Cannot filter injected score for url " + url
+ + ", using default (" + e.getMessage() + ")");
+ }
+ datum.setScore(scoreInjected);
+ }
+ }
+ // now add the metadata
+ Iterator<String> keysIter = metadata.keySet().iterator();
+ while (keysIter.hasNext()){
+ String keymd = keysIter.next();
+ String valuemd = metadata.get(keymd);
+ datum.getMetaData().put(new Text(keymd), new Text(valuemd));
}
output.collect(value, datum);
}