Posted to commits@nutch.apache.org by do...@apache.org on 2009/02/11 10:12:15 UTC
svn commit: r743277 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbMerger.java
Author: dogacan
Date: Wed Feb 11 09:12:15 2009
New Revision: 743277
URL: http://svn.apache.org/viewvc?rev=743277&view=rev
Log:
NUTCH-683 - NUTCH-676 broke CrawlDbMerger
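
A note on the likely mechanism (a reading of the patch below; the log does not
state it): NUTCH-676 switched CrawlDatum metadata to
org.apache.hadoop.io.MapWritable. In Hadoop versions of that era,
MapWritable.put() registers the key and value classes in the instance's
class-to-id table that write() relies on, while putAll() copies entries
without that registration, so a merged datum could fail on serialization.
The patch therefore replaces each putAll() with an explicit per-entry put()
loop. A minimal sketch of the pattern, using a hypothetical helper name
(MapWritableUtil is not part of Nutch):

    import java.util.Map.Entry;

    import org.apache.hadoop.io.MapWritable;
    import org.apache.hadoop.io.Writable;

    // Hypothetical helper illustrating the copy pattern the patch applies.
    public final class MapWritableUtil {
      private MapWritableUtil() {}

      // Copy src into dst one entry at a time. put() registers each
      // key/value class with dst's class-to-id table, which putAll()
      // skips in some Hadoop versions.
      public static void copyInto(MapWritable dst, MapWritable src) {
        for (Entry<Writable, Writable> e : src.entrySet()) {
          dst.put(e.getKey(), e.getValue());
        }
      }
    }
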
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=743277&r1=743276&r2=743277&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Feb 11 09:12:15 2009
@@ -338,6 +338,7 @@
126. NUTCH-636 - Httpclient plugin https doesn't work on IBM JRE
(Curtis d'Entremont, ab)
+127. NUTCH-683 - NUTCH-676 broke CrawlDbMerger. (dogacan)
Release 0.9 - 2007-04-02

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=743277&r1=743276&r2=743277&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Feb 11 09:12:15 2009
@@ -19,6 +19,7 @@
import java.io.IOException;
import java.util.*;
+import java.util.Map.Entry;
// Commons Logging imports
import org.apache.commons.logging.Log;
@@ -28,6 +29,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;
@@ -53,7 +55,7 @@
private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
- private org.apache.hadoop.io.MapWritable meta = new org.apache.hadoop.io.MapWritable();
+ private org.apache.hadoop.io.MapWritable meta;
private CrawlDatum res = new CrawlDatum();
private FetchSchedule schedule;
@@ -67,26 +69,32 @@
throws IOException {
long resTime = 0L;
boolean resSet = false;
- meta.clear();
+ meta = new org.apache.hadoop.io.MapWritable();
while (values.hasNext()) {
CrawlDatum val = values.next();
if (!resSet) {
res.set(val);
resSet = true;
resTime = schedule.calculateLastFetchTime(res);
- meta.putAll(res.getMetaData());
+ for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
+ meta.put(e.getKey(), e.getValue());
+ }
continue;
}
// compute last fetch time, and pick the latest
long valTime = schedule.calculateLastFetchTime(val);
if (valTime > resTime) {
// collect all metadata, newer values override older values
- meta.putAll(val.getMetaData());
+ for (Entry<Writable, Writable> e : val.getMetaData().entrySet()) {
+ meta.put(e.getKey(), e.getValue());
+ }
res.set(val);
resTime = valTime ;
} else {
// insert older metadata before newer
- val.getMetaData().putAll(meta);
+ for (Entry<Writable, Writable> e : meta.entrySet()) {
+ val.getMetaData().put(e.getKey(), e.getValue());
+ }
meta = val.getMetaData();
}
}
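
For readability, the merge loop reads roughly as follows after this patch
(reconstructed from the hunks above; the lines that emit the merged datum
fall outside the diff context and are not shown):

    meta = new org.apache.hadoop.io.MapWritable();
    while (values.hasNext()) {
      CrawlDatum val = values.next();
      if (!resSet) {
        // first datum seen becomes the provisional result
        res.set(val);
        resSet = true;
        resTime = schedule.calculateLastFetchTime(res);
        for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
          meta.put(e.getKey(), e.getValue());
        }
        continue;
      }
      // compute last fetch time, and pick the latest
      long valTime = schedule.calculateLastFetchTime(val);
      if (valTime > resTime) {
        // newer datum wins; its metadata overrides what was collected so far
        for (Entry<Writable, Writable> e : val.getMetaData().entrySet()) {
          meta.put(e.getKey(), e.getValue());
        }
        res.set(val);
        resTime = valTime;
      } else {
        // older datum: overlay the newer metadata onto it, then adopt its
        // map so newer values keep overriding older ones
        for (Entry<Writable, Writable> e : meta.entrySet()) {
          val.getMetaData().put(e.getKey(), e.getValue());
        }
        meta = val.getMetaData();
      }
    }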