You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/01/06 18:01:53 UTC

svn commit: r896539 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java

Author: jnioche
Date: Wed Jan  6 17:01:51 2010
New Revision: 896539

URL: http://svn.apache.org/viewvc?rev=896539&view=rev
Log:
NUTCH-655 : Injecting Crawl metadata (jnioche)

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=896539&r1=896538&r2=896539&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jan  6 17:01:51 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-655 Injecting Crawl metadata (jnioche)
+
 * NUTCH-658 Use counters to report fetching and parsing status (jnioche)
 
 * NUTCH-777 Upgrading to jetty6 broke unit tests (mattmann)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=896539&r1=896538&r2=896539&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Jan  6 17:01:51 2010
@@ -37,10 +37,21 @@
 import org.apache.nutch.util.NutchJob;
 
 /** This class takes a flat file of URLs and adds them to the of pages to be
- * crawled.  Useful for bootstrapping the system. */
+ * crawled.  Useful for bootstrapping the system. 
+ * The URL files contain one URL per line, optionally followed by custom metadata 
+ * separated by tabs with the metadata key separated from the corresponding value by '='. <br>
+ * Note that some metadata keys are reserved : <br>
+ * - <i>nutch.score</i> : sets a custom score for a specific URL <br>
+ * - <i>nutch.fetchInterval</i> : sets a custom fetch interval for a specific URL <br>
+ * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
+ **/
 public class Injector extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(Injector.class);
-
+  
+  /** metadata key reserved for setting a custom score for a specific URL */
+  public static String nutchScoreMDName = "nutch.score";
+  /** metadata key reserved for setting a custom fetchInterval for a specific URL */
+  public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
 
   /** Normalize and filter injected urls. */
   public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
@@ -68,6 +79,36 @@
                     OutputCollector<Text, CrawlDatum> output, Reporter reporter)
       throws IOException {
       String url = value.toString();              // value is line of text
+      // if the line contains tabs, the fields after the URL are metadata to store;
+      // each field must have the form name=value, and fields are separated by \t
+      float customScore = -1f;
+      int customInterval = interval;
+      Map<String,String> metadata = new TreeMap<String,String>();
+      if (url.indexOf("\t")!=-1){
+    	  String[] splits = url.split("\t");
+    	  url = splits[0];
+    	  for (int s=1;s<splits.length;s++){
+    		  // find separation between name and value
+    		  int indexEquals = splits[s].indexOf("=");
+    		  if (indexEquals==-1) {
+    			  // skip anything without a =
+    			  continue;		    
+    		  }
+    		  String metaname = splits[s].substring(0, indexEquals);
+    		  String metavalue = splits[s].substring(indexEquals+1);
+    		  if (metaname.equals(nutchScoreMDName)) {
+    			  try {
+    			  customScore = Float.parseFloat(metavalue);}
+    			  catch (NumberFormatException nfe){}
+    		  }
+    		  else if (metaname.equals(nutchFetchIntervalMDName)) {
+    			  try {
+    				  customInterval = Integer.parseInt(metavalue);}
+    			  catch (NumberFormatException nfe){}
+    		  }
+    		  else metadata.put(metaname,metavalue);
+    	  }
+      }
       try {
         url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
         url = filters.filter(url);             // filter the url
@@ -77,17 +118,27 @@
       }
       if (url != null) {                          // if it passes
         value.set(url);                           // collect it
-        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, interval);
+        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, customInterval);
         datum.setFetchTime(curTime);
-        datum.setScore(scoreInjected);
-        try {
-          scfilters.injectedScore(value, datum);
-        } catch (ScoringFilterException e) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn("Cannot filter injected score for url " + url +
-                     ", using default (" + e.getMessage() + ")");
-          }
+        if (customScore != -1) datum.setScore(customScore);
+        else {
           datum.setScore(scoreInjected);
+          try {
+            scfilters.injectedScore(value, datum);
+          } catch (ScoringFilterException e) {
+            if (LOG.isWarnEnabled()) {
+              LOG.warn("Cannot filter injected score for url " + url
+                  + ", using default (" + e.getMessage() + ")");
+            }
+            datum.setScore(scoreInjected);
+          }
+        }
+        // now add the metadata
+        Iterator<String> keysIter = metadata.keySet().iterator();
+        while (keysIter.hasNext()){
+        	String keymd = keysIter.next();
+        	String valuemd = metadata.get(keymd);
+        	datum.getMetaData().put(new Text(keymd), new Text(valuemd));
         }
         output.collect(value, datum);
       }