You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2009/11/25 19:08:24 UTC

svn commit: r884224 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Author: ab
Date: Wed Nov 25 18:08:24 2009
New Revision: 884224

URL: http://svn.apache.org/viewvc?rev=884224&view=rev
Log:
NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884224&r1=884223&r2=884224&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 18:08:24 2009
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab)
+
 * NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab)
 
 * NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab via ab)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=884224&r1=884223&r2=884224&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Nov 25 18:08:24 2009
@@ -64,13 +64,20 @@
     boolean fetchSet = false;
     boolean oldSet = false;
     byte[] signature = null;
+    boolean multiple = false; // avoid deep copy when only single value exists
     linked.clear();
 
     while (values.hasNext()) {
       CrawlDatum datum = (CrawlDatum)values.next();
+      if (!multiple && values.hasNext()) multiple = true;
       if (CrawlDatum.hasDbStatus(datum)) {
         if (!oldSet) {
-          old.set(datum);
+          if (multiple) {
+            old.set(datum);
+          } else {
+            // no need for a deep copy - this is the only value
+            old = datum;
+          }
           oldSet = true;
         } else {
           // always take the latest version
@@ -81,7 +88,11 @@
 
       if (CrawlDatum.hasFetchStatus(datum)) {
         if (!fetchSet) {
-          fetch.set(datum);
+          if (multiple) {
+            fetch.set(datum);
+          } else {
+            fetch = datum;
+          }
           fetchSet = true;
         } else {
           // always take the latest version
@@ -92,8 +103,13 @@
 
       switch (datum.getStatus()) {                // collect other info
       case CrawlDatum.STATUS_LINKED:
-        CrawlDatum link = new CrawlDatum();
-        link.set(datum);
+        CrawlDatum link;
+        if (multiple) {
+          link = new CrawlDatum();
+          link.set(datum);
+        } else {
+          link = datum;
+        }
         linked.add(link);
         break;
       case CrawlDatum.STATUS_SIGNATURE:
@@ -115,10 +131,11 @@
     
     // still no new data - record only unchanged old data, if exists, and return
     if (!fetchSet) {
-      if (oldSet) // at this point at least "old" should be present
+      if (oldSet) {// at this point at least "old" should be present
         output.collect(key, old);
-      else
+      } else {
         LOG.warn("Missing fetch and old value, signature=" + signature);
+      }
       return;
     }