You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/07/05 18:57:57 UTC

svn commit: r1357739 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/Injector.java src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java src/test/org/apache/nutch/crawl/TestInjector.java

Author: markus
Date: Thu Jul  5 16:57:57 2012
New Revision: 1357739

URL: http://svn.apache.org/viewvc?rev=1357739&view=rev
Log:
NUTCH-1405 Allow to overwrite CrawlDatum's with injected entries

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
    nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
    nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul  5 16:57:57 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1405 Allow to overwrite CrawlDatum's with injected entries (markus)
+
 * NUTCH-1412 Upgrade commons lang (markus)
 
 * NUTCH-1251 SolrDedup to use proper Lucene catch-all query (Arkadi Kosmynin via markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Jul  5 16:57:57 2012
@@ -468,6 +468,23 @@
   </description>
 </property>
 
+ <property>
+  <name>db.injector.overwrite</name>
+  <value>false</value>
+  <description>Whether existing records in the CrawlDB will be overwritten
+  by injected records.
+  </description>
+</property>
+
+<property>
+  <name>db.injector.update</name>
+  <value>false</value>
+  <description>If true, existing records in the CrawlDB will be updated with
+  injected records. Old meta data is preserved. The db.injector.overwrite
+  parameter has precedence.
+  </description>
+</property>
+
 <property>
   <name>db.score.injected</name>
   <value>1.0</value>

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Jul  5 16:57:57 2012
@@ -152,7 +152,18 @@ public class Injector extends Configured
 
   /** Combine multiple new entries for a url. */
   public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
-    public void configure(JobConf job) {}    
+    private int interval;
+    private float scoreInjected;
+    private boolean overwrite = false;
+    private boolean update = false;
+    /** Reads the default fetch interval, the injected score and the overwrite/update flags from the job. */
+    public void configure(JobConf job) {
+      interval = job.getInt("db.fetch.interval.default", 2592000);
+      scoreInjected = job.getFloat("db.score.injected", 1.0f);
+      overwrite = job.getBoolean("db.injector.overwrite", false);
+      update = job.getBoolean("db.injector.update", false);
+    }
+    
     public void close() {}
 
     private CrawlDatum old = new CrawlDatum();
@@ -162,19 +173,48 @@ public class Injector extends Configured
                        OutputCollector<Text, CrawlDatum> output, Reporter reporter)
       throws IOException {
       boolean oldSet = false;
+      boolean injectedSet = false;
       while (values.hasNext()) {
         CrawlDatum val = values.next();
         if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
           injected.set(val);
           injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+          injectedSet = true;
         } else {
           old.set(val);
           oldSet = true;
         }
       }
       CrawlDatum res = null;
-      if (oldSet) res = old; // don't overwrite existing value
-      else res = injected;
+      
+      /**
+       * Whether to overwrite, ignore or update existing records
+       * @see https://issues.apache.org/jira/browse/NUTCH-1405
+       */
+      
+      // Default behaviour: keep an existing record, else take the injected one
+      if (injectedSet && !oldSet) {
+        res = injected;
+      } else {
+        res = old;
+      }
+
+      // Record exists and db.injector.overwrite is set: the injected record
+      // replaces it (overwrite has precedence over update)
+      if (injectedSet && oldSet && overwrite) {
+        res = injected;
+        if (update) {
+          LOG.info(key.toString() + " overwritten with injected record but update was specified.");
+        }
+      }
+
+      // Record exists and only db.injector.update is set: add the injected
+      // meta data and any non-default injected score/interval to the old record
+      if (injectedSet && oldSet && update && !overwrite) {
+        old.putAllMetaData(injected);
+        old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
+        old.setFetchInterval(injected.getFetchInterval() != interval ? injected.getFetchInterval() : old.getFetchInterval());
+      }
 
       output.collect(key, res);
     }

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Thu Jul  5 16:57:57 2012
@@ -18,6 +18,7 @@ package org.apache.nutch.crawl;
 
 import java.io.IOException;
 import java.net.UnknownHostException;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
@@ -104,22 +105,43 @@ public class CrawlDBTestUtil {
       this.datum = datum;
     }
   }
+
+  /**
+   * Generate a seed list from the given URLs, without any metadata
+   * @throws IOException 
+   */
+  public static void generateSeedList(FileSystem fs, Path urlPath, List<String> urls) throws IOException{
+    generateSeedList(fs, urlPath, urls, new ArrayList<String>());
+  }
   
   /**
    * Generate seedlist
    * @throws IOException 
    */
-  public static void generateSeedList(FileSystem fs, Path urlPath, List<String> contents) throws IOException{
+  public static void generateSeedList(FileSystem fs, Path urlPath, List<String> urls, List<String>metadata) throws IOException{
     FSDataOutputStream out;
     Path file=new Path(urlPath,"urls.txt");
     fs.mkdirs(urlPath);
     out=fs.create(file);
-    Iterator<String> iterator=contents.iterator();
-    while(iterator.hasNext()){
-      String url=iterator.next();
+    
+    Iterator<String> urls_i=urls.iterator();
+    Iterator<String> metadata_i=metadata.iterator();
+    
+    String url;
+    String md;
+    while(urls_i.hasNext()){
+      url=urls_i.next();
+
       out.writeBytes(url);
+      // Metadata, while any is left, is appended directly after the URL on the same line
+      if (metadata_i.hasNext()) {
+        md = metadata_i.next();
+        out.writeBytes(md);
+      }
+
       out.writeBytes("\n");
     }
+    
     out.flush();
     out.close();
   }

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Thu Jul  5 16:57:57 2012
@@ -19,7 +19,9 @@ package org.apache.nutch.crawl;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -62,10 +64,13 @@ public class TestInjector extends TestCa
 
   public void testInject() throws IOException {
     ArrayList<String> urls=new ArrayList<String>();
+    // We'll use a separate list for MD so we can still compare url with containsAll
+    ArrayList<String> metadata=new ArrayList<String>();
     for(int i=0;i<100;i++) {
       urls.add("http://zzz.com/" + i + ".html");
+      metadata.add("\tnutch.score=2." + i + "\tnutch.fetchInterval=171717\tkey=value");
     }
-    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata);
     
     Injector injector=new Injector(conf);
     injector.inject(crawldbPath, urlPath);
@@ -85,23 +90,44 @@ public class TestInjector extends TestCa
     ArrayList<String> urls2=new ArrayList<String>();
     for(int i=0;i<100;i++) {
       urls2.add("http://xxx.com/" + i + ".html");
+      // Re-inject the zzz.com URLs without MD; with db.injector.update their original MD must be preserved
+      urls2.add("http://zzz.com/" + i + ".html");
     }
     CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
+    injector=new Injector(conf);
+    conf.setBoolean("db.injector.update", true);
     injector.inject(crawldbPath, urlPath);
     urls.addAll(urls2);
     
     // verify results
     read=readCrawldb();
-    
 
     Collections.sort(read);
     Collections.sort(urls);
 
-    assertEquals(urls.size(), read.size());
+    // We should have 100 fewer records because the zzz.com URLs were injected twice
+    assertEquals(urls.size() - 100, read.size());
     
     assertTrue(read.containsAll(urls));
     assertTrue(urls.containsAll(read));
     
+    // Check if we correctly preserved MD
+    Map<String, CrawlDatum> records = readCrawldbRecords();
+    
+    // Iterate over the urls, we're looking for http://zzz.com/ prefixed URLs
+    // so we can check for MD and score and interval
+    Text writableKey = new Text("key");
+    Text writableValue = new Text("value");
+    for (String url : urls) {
+      if (url.indexOf("http://zzz") == 0) {       
+        // Check for fetch interval
+        assertTrue(records.get(url).getFetchInterval() == 171717);
+        // Check that the injected score was kept, i.e. it is not the default 1.0
+        assertTrue(records.get(url).getScore() != 1.0);
+        // Check for MD key=value
+        assertEquals(writableValue, records.get(url).getMetaData().get(writableKey));
+      }
+    }
   }
   
   private List<String> readCrawldb() throws IOException{
@@ -120,5 +146,21 @@ public class TestInjector extends TestCa
 
     return read;
   }
+  /** Reads all records of the part-00000 CrawlDb data file into a map keyed by URL string. */
+  private HashMap<String,CrawlDatum> readCrawldbRecords() throws IOException{
+    Path dbfile=new Path(crawldbPath,CrawlDb.CURRENT_NAME + "/part-00000/data");
+    System.out.println("reading:" + dbfile);
+    SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf);
+    HashMap<String,CrawlDatum> read=new HashMap<String,CrawlDatum>();
+    try {
+      Text key=new Text(); // reused across records: only its String copy is stored
+      for(CrawlDatum value=new CrawlDatum(); reader.next(key, value); value=new CrawlDatum()) {
+        read.put(key.toString(), value);
+      }
+    } finally {
+      reader.close(); // release the file even when a read fails
+    }
 
+    return read;
+  }
 }