You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2009/11/25 19:08:24 UTC
svn commit: r884224 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: ab
Date: Wed Nov 25 18:08:24 2009
New Revision: 884224
URL: http://svn.apache.org/viewvc?rev=884224&view=rev
Log:
NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884224&r1=884223&r2=884224&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 18:08:24 2009
@@ -2,6 +2,8 @@
Unreleased Changes
+* NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab)
+
* NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab)
* NUTCH-773 - Some minor bugs in AbstractFetchSchedule (Reinhard Schwab via ab)
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=884224&r1=884223&r2=884224&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Nov 25 18:08:24 2009
@@ -64,13 +64,20 @@
boolean fetchSet = false;
boolean oldSet = false;
byte[] signature = null;
+ boolean multiple = false; // avoid deep copy when only single value exists
linked.clear();
while (values.hasNext()) {
CrawlDatum datum = (CrawlDatum)values.next();
+ if (!multiple && values.hasNext()) multiple = true;
if (CrawlDatum.hasDbStatus(datum)) {
if (!oldSet) {
- old.set(datum);
+ if (multiple) {
+ old.set(datum);
+ } else {
+ // no need for a deep copy - this is the only value
+ old = datum;
+ }
oldSet = true;
} else {
// always take the latest version
@@ -81,7 +88,11 @@
if (CrawlDatum.hasFetchStatus(datum)) {
if (!fetchSet) {
- fetch.set(datum);
+ if (multiple) {
+ fetch.set(datum);
+ } else {
+ fetch = datum;
+ }
fetchSet = true;
} else {
// always take the latest version
@@ -92,8 +103,13 @@
switch (datum.getStatus()) { // collect other info
case CrawlDatum.STATUS_LINKED:
- CrawlDatum link = new CrawlDatum();
- link.set(datum);
+ CrawlDatum link;
+ if (multiple) {
+ link = new CrawlDatum();
+ link.set(datum);
+ } else {
+ link = datum;
+ }
linked.add(link);
break;
case CrawlDatum.STATUS_SIGNATURE:
@@ -115,10 +131,11 @@
// still no new data - record only unchanged old data, if exists, and return
if (!fetchSet) {
- if (oldSet) // at this point at least "old" should be present
+ if (oldSet) {// at this point at least "old" should be present
output.collect(key, old);
- else
+ } else {
LOG.warn("Missing fetch and old value, signature=" + signature);
+ }
return;
}