You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@nutch.apache.org by cu...@apache.org on 2005/07/10 23:11:22 UTC

svn commit: r210034 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Author: cutting
Date: Sun Jul 10 14:11:20 2005
New Revision: 210034

URL: http://svn.apache.org/viewcvs?rev=210034&view=rev
Log:
Fix so that fetcher does not split its input files, since they're
already split by host and should not be subdivided.

Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=210034&r1=210033&r2=210034&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Sun Jul 10 14:11:20 2005
@@ -38,6 +38,19 @@
   
   public static final String DIGEST_KEY = "nutch.content.digest";
 
+  /** Whole-file input splits; static so it can be constructed without an enclosing instance. */
+  public static class InputFormat extends SequenceFileInputFormat {
+    /** Don't split inputs, to keep things polite: returns one split per listed file. */
+    public FileSplit[] getSplits(NutchFileSystem fs, JobConf job, int nSplits) throws IOException {
+      File[] files = listFiles(fs, job);
+      FileSplit[] splits = new FileSplit[files.length];
+      for (int i = 0; i < files.length; i++) {
+        splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i]));
+      }
+      return splits;
+    }
+  }
+
   private RecordReader input;
   private OutputCollector output;