Posted to commits@nutch.apache.org by jn...@apache.org on 2010/01/08 13:01:51 UTC

svn commit: r897180 - in /lucene/nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Author: jnioche
Date: Fri Jan  8 12:01:46 2010
New Revision: 897180

URL: http://svn.apache.org/viewvc?rev=897180&view=rev
Log:
NUTCH-269 : OOME because no upper-bound on inlinks count (stack + jnioche)
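
In short: before this change CrawlDbReducer collected every STATUS_LINKED
datum for a URL into an unbounded ArrayList, so a page with a very large
number of inlinks could exhaust the reducer's heap. The fix replaces the
list with a size-bounded priority queue, capped by the new
db.update.max.inlinks property (default 10000). The sketch below only
illustrates the general technique and uses the plain JDK
java.util.PriorityQueue; the actual patch uses
org.apache.hadoop.util.PriorityQueue, as shown in the diff further down,
and the class and method names here are made up for the example.

    import java.util.Comparator;
    import java.util.PriorityQueue;

    // Generic size-bounded heap: adding an element beyond the cap evicts the
    // head of the ordering, so memory stays O(maxSize) no matter how many
    // inlinks a single URL has.
    class BoundedQueue<T> {
      private final int maxSize;
      private final PriorityQueue<T> heap;

      BoundedQueue(int maxSize, Comparator<T> order) {
        this.maxSize = maxSize;
        this.heap = new PriorityQueue<T>(maxSize, order);
      }

      void add(T element) {
        heap.offer(element);
        if (heap.size() > maxSize) {
          heap.poll();   // drop the element at the head of the ordering
        }
      }
    }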

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=897180&r1=897179&r2=897180&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Jan  8 12:01:46 2010
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-269 CrawlDbReducer: OOME because no upper-bound on inlinks count (stack + jnioche)
+
 * NUTCH-655 Injecting Crawl metadata (jnioche)
 
 * NUTCH-658 Use counters to report fetching and parsing status (jnioche)

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=897180&r1=897179&r2=897180&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Fri Jan  8 12:01:46 2010
@@ -384,6 +384,14 @@
 </property>
 
 <property>
+  <name>db.update.max.inlinks</name>
+  <value>10000</value>
+  <description>Maximum number of inlinks to take into account when updating 
+  a URL score in the crawlDB. Only the best scoring inlinks are kept. 
+  </description>
+</property>
+
+<property>
   <name>db.ignore.internal.links</name>
   <value>true</value>
   <description>If true, when adding new links to a page, links from

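The new db.update.max.inlinks property caps how many inlinks are considered
when a URL's score is updated during updatedb; anything beyond the cap is
dropped by the bounded queue in CrawlDbReducer. The default of 10000 can be
overridden in nutch-site.xml, or programmatically as in the hedged sketch
below (the driver class name is made up for illustration):

    import org.apache.hadoop.mapred.JobConf;
    import org.apache.nutch.util.NutchConfiguration;

    // Hypothetical override from a custom driver; the usual route is an
    // entry in nutch-site.xml.
    public class UpdateDbSettings {
      public static JobConf newJobConf() {
        JobConf job = new JobConf(NutchConfiguration.create());
        // Reducer side: CrawlDbReducer.configure() reads this key with a
        // default of 10000 (see the Java diff below).
        job.setInt("db.update.max.inlinks", 5000);
        return job;
      }
    }
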
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=897180&r1=897179&r2=897180&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri Jan  8 12:01:46 2010
@@ -19,6 +19,7 @@
 
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 import java.io.IOException;
 
 // Commons Logging imports
@@ -27,6 +28,7 @@
 
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.PriorityQueue;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
@@ -37,7 +39,7 @@
   
   private int retryMax;
   private CrawlDatum result = new CrawlDatum();
-  private ArrayList<CrawlDatum> linked = new ArrayList<CrawlDatum>();
+  private InlinkPriorityQueue linked = null;
   private ScoringFilters scfilters = null;
   private boolean additionsAllowed;
   private int maxInterval;
@@ -51,6 +53,8 @@
     maxInterval = job.getInt("db.fetch.interval.max", 0 );
     if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
     schedule = FetchScheduleFactory.getFetchSchedule(job);
+    int maxLinks = job.getInt("db.update.max.inlinks", 10000);
+    linked = new InlinkPriorityQueue(maxLinks);
   }
 
   public void close() {}
@@ -111,7 +115,7 @@
         } else {
           link = datum;
         }
-        linked.add(link);
+        linked.insert(link);
         break;
       case CrawlDatum.STATUS_SIGNATURE:
         signature = datum.getSignature();
@@ -120,13 +124,21 @@
         LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
       }
     }
-
+    
+    // copy the content of the queue into a List
+    // in reversed order
+    int numLinks = linked.size();
+    List<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
+    for (int i = numLinks - 1; i >= 0; i--) {
+      linkList.add(linked.pop());
+    }
+    
     // if it doesn't already exist, skip it
     if (!oldSet && !additionsAllowed) return;
     
     // if there is no fetched datum, perhaps there is a link
-    if (!fetchSet && linked.size() > 0) {
-      fetch = linked.get(0);
+    if (!fetchSet && linkList.size() > 0) {
+      fetch = linkList.get(0);
       fetchSet = true;
     }
     
@@ -260,7 +272,7 @@
     }
 
     try {
-      scfilters.updateDbScore((Text)key, oldSet ? old : null, result, linked);
+      scfilters.updateDbScore((Text)key, oldSet ? old : null, result, linkList);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
         LOG.warn("Couldn't update score, key=" + key + ": " + e);
@@ -270,5 +282,20 @@
     result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
     output.collect(key, result);
   }
+  
+}
 
+class InlinkPriorityQueue extends PriorityQueue<CrawlDatum> {
+  
+  public InlinkPriorityQueue(int maxSize) {
+    initialize(maxSize);
+  }
+  
+  /** Determines the ordering of objects in this priority queue. **/
+  protected boolean lessThan(Object arg0, Object arg1) {
+    CrawlDatum candidate = (CrawlDatum) arg0;
+    CrawlDatum least = (CrawlDatum) arg1;
+    return candidate.getScore() > least.getScore();
+  }
+  
 }
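
For context, a hedged sketch of how the new bounded queue behaves at runtime:
org.apache.hadoop.util.PriorityQueue.initialize(maxSize) fixes the heap's
capacity, and once the queue is full insert() either replaces the head or
discards the new element (according to lessThan()) instead of growing, which
is what bounds the reducer's memory. The demo class below is not part of the
commit; it assumes it lives in the org.apache.nutch.crawl package, since
InlinkPriorityQueue is package-private, and the cap of 3 and the scores are
made up.

    package org.apache.nutch.crawl;

    import java.util.ArrayList;
    import java.util.List;

    // Illustrative only: exercises InlinkPriorityQueue the same way
    // CrawlDbReducer.reduce() does.
    public class InlinkQueueDemo {
      public static void main(String[] args) {
        InlinkPriorityQueue queue = new InlinkPriorityQueue(3);
        for (float score : new float[] {0.2f, 1.5f, 0.7f, 3.0f, 0.1f}) {
          CrawlDatum d = new CrawlDatum();
          d.setScore(score);
          queue.insert(d);   // never holds more than 3 entries
        }
        // Drain into a List, as reduce() does before calling updateDbScore().
        int numLinks = queue.size();
        List<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
        for (int i = numLinks - 1; i >= 0; i--) {
          linkList.add(queue.pop());
        }
        System.out.println(linkList.size() + " inlinks retained");   // prints "3 inlinks retained"
      }
    }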