Posted to commits@nutch.apache.org by ab...@apache.org on 2008/03/19 11:34:24 UTC

svn commit: r638779 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/tools/ src/java/org/apache/nutch/tools/ar...

Author: ab
Date: Wed Mar 19 03:34:14 2008
New Revision: 638779

URL: http://svn.apache.org/viewvc?rev=638779&view=rev
Log:
NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API.
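The driver change repeated throughout this patch follows the standard Hadoop Tool pattern: each job class now extends Configured and implements Tool, and its main() delegates to ToolRunner instead of the deprecated ToolBase.doMain(). A minimal sketch of the new shape, assuming a Hadoop 0.16-era classpath (the class name MyTool is illustrative only, not a class from this commit):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.util.NutchConfiguration;

    // Illustrative driver only; not a class from this commit.
    public class MyTool extends Configured implements Tool {

      public int run(String[] args) throws Exception {
        // getConf() returns the Configuration that ToolRunner injected via setConf().
        Configuration conf = getConf();
        System.out.println("mapred.job.tracker = " + conf.get("mapred.job.tracker"));
        return 0;
      }

      public static void main(String[] args) throws Exception {
        // ToolRunner applies Hadoop's generic command-line options to the
        // configuration before invoking run().
        int res = ToolRunner.run(NutchConfiguration.create(), new MyTool(), args);
        System.exit(res);
      }
    }

A side benefit of the switch is that every converted tool below now picks up the generic Hadoop options handled by ToolRunner rather than parsing only its own arguments.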

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Mar 19 03:34:14 2008
@@ -239,6 +239,9 @@
 
 87. NUTCH-223 - Crawl.java uses Integer.MAX_VALUE (Jeff Ritchie via ab)
 
+88. NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API.
+    (Emmanuel Joke, dogacan, ab)
+
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Mar 19 03:34:14 2008
@@ -28,8 +28,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.LockUtil;
@@ -40,7 +39,7 @@
  * This class takes the output of the fetcher and updates the
  * crawldb accordingly.
  */
-public class CrawlDb extends ToolBase {
+public class CrawlDb extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(CrawlDb.class);
 
   public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
@@ -48,11 +47,8 @@
   public static final String CURRENT_NAME = "current";
   
   public static final String LOCK_NAME = ".locked";
-
   
-  public CrawlDb() {
-    
-  }
+  public CrawlDb() {}
   
   public CrawlDb(Configuration conf) {
     setConf(conf);
@@ -150,7 +146,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new CrawlDb().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
     System.exit(res);
   }
 
@@ -182,8 +178,8 @@
       } else if (args[i].equals("-noAdditions")) {
         additionsAllowed = false;
       } else if (args[i].equals("-dir")) {
-        Path[] paths = fs.listPaths(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
-        dirs.addAll(Arrays.asList(paths));
+        FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
+        dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
       } else {
         dirs.add(new Path(args[i]));
       }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Mar 19 03:34:14 2008
@@ -28,10 +28,9 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
@@ -50,10 +49,10 @@
  * 
  * @author Andrzej Bialecki
  */
-public class CrawlDbMerger extends ToolBase {
+public class CrawlDbMerger extends Configured implements Tool {
   private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
 
-  public static class Merger extends MapReduceBase implements Reducer {
+  public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
     MapWritable meta = new MapWritable();
     private FetchSchedule schedule;
 
@@ -63,13 +62,13 @@
       schedule = FetchScheduleFactory.getFetchSchedule(conf);
     }
 
-    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
+    public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter)
             throws IOException {
       CrawlDatum res = null;
       long resTime = 0L;
       meta.clear();
       while (values.hasNext()) {
-        CrawlDatum val = (CrawlDatum) values.next();
+        CrawlDatum val = values.next();
         if (res == null) {
           res = val;
           resTime = schedule.calculateLastFetchTime(res);
@@ -138,7 +137,7 @@
    * @param args
    */
   public static void main(String[] args) throws Exception {
-    int res = new CrawlDbMerger().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(), args);
     System.exit(res);
   }
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Mar 19 03:34:14 2008
@@ -29,8 +29,7 @@
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
@@ -45,7 +44,7 @@
 import org.apache.nutch.util.NutchJob;
 
 /** Generates a subset of a crawl db to fetch. */
-public class Generator extends ToolBase {
+public class Generator extends Configured implements Tool {
 
   public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
   public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
@@ -81,7 +80,7 @@
   }
 
   /** Selects entries due for fetch. */
-  public static class Selector implements Mapper, Partitioner, Reducer {
+  public static class Selector implements Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>, Partitioner<FloatWritable, Writable>, Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> {
     private LongWritable genTime = new LongWritable(System.currentTimeMillis());
     private long curTime;
     private long limit;
@@ -89,7 +88,7 @@
     private HashMap<String, IntWritable> hostCounts =
       new HashMap<String, IntWritable>();
     private int maxPerHost;
-    private Partitioner hostPartitioner = new PartitionUrlByHost();
+    private Partitioner<Text, Writable> hostPartitioner = new PartitionUrlByHost();
     private URLFilters filters;
     private URLNormalizers normalizers;
     private ScoringFilters scfilters;
@@ -120,10 +119,10 @@
     public void close() {}
 
     /** Select & invert subset due for fetch. */
-    public void map(WritableComparable key, Writable value,
-                    OutputCollector output, Reporter reporter)
+    public void map(Text key, CrawlDatum value,
+                    OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
       throws IOException {
-      Text url = (Text)key;
+      Text url = key;
       if (filter) {
         // If filtering is on don't generate URLs that don't pass URLFilters
         try {
@@ -136,7 +135,7 @@
           }
         }
       }
-      CrawlDatum crawlDatum = (CrawlDatum)value;
+      CrawlDatum crawlDatum = value;
 
       // check fetch schedule
       if (!schedule.shouldFetch(url, crawlDatum, curTime)) {
@@ -167,20 +166,21 @@
     }
 
     /** Partition by host. */
-    public int getPartition(WritableComparable key, Writable value,
+    public int getPartition(FloatWritable key, Writable value,
                             int numReduceTasks) {
       return hostPartitioner.getPartition(((SelectorEntry)value).url, key,
                                           numReduceTasks);
     }
 
     /** Collect until limit is reached. */
-    public void reduce(WritableComparable key, Iterator values,
-                       OutputCollector output, Reporter reporter)
+    public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
+                       OutputCollector<FloatWritable, SelectorEntry> output,
+                       Reporter reporter)
       throws IOException {
 
       while (values.hasNext() && count < limit) {
 
-        SelectorEntry entry = (SelectorEntry)values.next();
+        SelectorEntry entry = values.next();
         Text url = entry.url;        
         String urlString = url.toString();        
         URL u = null;
@@ -268,22 +268,23 @@
     }
   }
 
-  public static class SelectorInverseMapper extends MapReduceBase implements Mapper {
+  public static class SelectorInverseMapper extends MapReduceBase implements Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> {
 
-    public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+    public void map(FloatWritable key, SelectorEntry value, OutputCollector<Text, SelectorEntry> output, Reporter reporter) throws IOException {
       SelectorEntry entry = (SelectorEntry)value;
       output.collect(entry.url, entry);
     }
   }
   
-  public static class PartitionReducer extends MapReduceBase implements Reducer {
+  public static class PartitionReducer extends MapReduceBase
+      implements Reducer<Text, SelectorEntry, Text, CrawlDatum> {
 
-    public void reduce(WritableComparable key, Iterator values,
-        OutputCollector output, Reporter reporter) throws IOException {
+    public void reduce(Text key, Iterator<SelectorEntry> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
       // if using HashComparator, we get only one input key in case of hash collision
       // so use only URLs from values
       while (values.hasNext()) {
-        SelectorEntry entry = (SelectorEntry)values.next();
+        SelectorEntry entry = values.next();
         output.collect(entry.url, entry.datum);
       }
     }
@@ -323,27 +324,27 @@
   /**
    * Update the CrawlDB so that the next generate won't include the same URLs.
    */
-  public static class CrawlDbUpdater extends MapReduceBase implements Mapper, Reducer {
+  public static class CrawlDbUpdater extends MapReduceBase implements Mapper<WritableComparable, Writable, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> {
     long generateTime;
     
     public void configure(JobConf job) {
       generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
     }
     
-    public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+    public void map(WritableComparable key, Writable value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
       if (key instanceof FloatWritable) { // tempDir source
         SelectorEntry se = (SelectorEntry)value;
         output.collect(se.url, se.datum);
       } else {
-        output.collect(key, value);
+        output.collect((Text)key, (CrawlDatum)value);
       }
     }
 
-    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+    public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
       CrawlDatum orig = null;
       LongWritable genTime = null;
       while (values.hasNext()) {
-        CrawlDatum val = (CrawlDatum)values.next();
+        CrawlDatum val = values.next();
         if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
           genTime = (LongWritable)val.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
           if (genTime.get() != generateTime) {
@@ -359,13 +360,10 @@
         orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
       }
       output.collect(key, orig);
-    }
-    
+    }    
   }
   
-  public Generator() {
-    
-  }
+  public Generator() {}
   
   public Generator(Configuration conf) {
     setConf(conf);
@@ -564,7 +562,7 @@
    * Generate a fetchlist from the crawldb.
    */
   public static void main(String args[]) throws Exception {
-    int res = new Generator().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new Generator(), args);
     System.exit(res);
   }
   

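The same generics clean-up recurs in the remaining files: the raw Mapper/Reducer interfaces of the old org.apache.hadoop.mapred API are replaced with their parameterized forms, which removes the (Text)/(CrawlDatum)-style casts inside map() and reduce(). A minimal self-contained sketch of the typed interfaces, assuming the same Hadoop API (the word-count style classes are illustrative only, not part of this commit):

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;

    // Illustrative classes only; not part of this commit.
    public class TypedExample {

      public static class Map extends MapReduceBase
          implements Mapper<LongWritable, Text, Text, LongWritable> {
        private final LongWritable one = new LongWritable(1);
        public void map(LongWritable key, Text value,
                        OutputCollector<Text, LongWritable> output, Reporter reporter)
            throws IOException {
          // Keys and values arrive already typed; no casts from Writable are needed.
          output.collect(value, one);
        }
      }

      public static class Reduce extends MapReduceBase
          implements Reducer<Text, LongWritable, Text, LongWritable> {
        public void reduce(Text key, Iterator<LongWritable> values,
                           OutputCollector<Text, LongWritable> output, Reporter reporter)
            throws IOException {
          long sum = 0;
          while (values.hasNext()) {
            sum += values.next().get();   // typed iterator, no (LongWritable) cast
          }
          output.collect(key, new LongWritable(sum));
        }
      }
    }
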
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Mar 19 03:34:14 2008
@@ -28,8 +28,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.net.*;
 import org.apache.nutch.scoring.ScoringFilterException;
@@ -39,12 +38,12 @@
 
 /** This class takes a flat file of URLs and adds them to the of pages to be
  * crawled.  Useful for bootstrapping the system. */
-public class Injector extends ToolBase {
+public class Injector extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(Injector.class);
 
 
   /** Normalize and filter injected urls. */
-  public static class InjectMapper implements Mapper {
+  public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
     private URLNormalizers urlNormalizers;
     private int interval;
     private float scoreInjected;
@@ -65,12 +64,10 @@
 
     public void close() {}
 
-    public void map(WritableComparable key, Writable val,
-                    OutputCollector output, Reporter reporter)
+    public void map(WritableComparable key, Text value,
+                    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
       throws IOException {
-      Text value = (Text)val;
       String url = value.toString();              // value is line of text
-      // System.out.println("url: " +url);
       try {
         url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
         url = filters.filter(url);             // filter the url
@@ -98,17 +95,17 @@
   }
 
   /** Combine multiple new entries for a url. */
-  public static class InjectReducer implements Reducer {
+  public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
     public void configure(JobConf job) {}    
     public void close() {}
 
-    public void reduce(WritableComparable key, Iterator values,
-                       OutputCollector output, Reporter reporter)
+    public void reduce(Text key, Iterator<CrawlDatum> values,
+                       OutputCollector<Text, CrawlDatum> output, Reporter reporter)
       throws IOException {
       CrawlDatum old = null;
       CrawlDatum injected = null;
       while (values.hasNext()) {
-        CrawlDatum val = (CrawlDatum)values.next();
+        CrawlDatum val = values.next();
         if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
           injected = val;
           injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
@@ -124,9 +121,7 @@
     }
   }
 
-  public Injector() {
-    
-  }
+  public Injector() {}
   
   public Injector(Configuration conf) {
     setConf(conf);
@@ -179,7 +174,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new Injector().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new Injector(), args);
     System.exit(res);
   }
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Wed Mar 19 03:34:14 2008
@@ -30,8 +30,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
@@ -42,7 +41,7 @@
 import org.apache.nutch.util.NutchJob;
 
 /** Maintains an inverted link map, listing incoming links for each url. */
-public class LinkDb extends ToolBase implements Mapper {
+public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData, Text, Inlinks> {
 
   public static final Log LOG = LogFactory.getLog(LinkDb.class);
 
@@ -54,9 +53,7 @@
   private URLFilters urlFilters;
   private URLNormalizers urlNormalizers;
   
-  public LinkDb() {
-    
-  }
+  public LinkDb() {}
   
   public LinkDb(Configuration conf) {
     setConf(conf);
@@ -75,8 +72,8 @@
 
   public void close() {}
 
-  public void map(WritableComparable key, Writable value,
-                  OutputCollector output, Reporter reporter)
+  public void map(Text key, ParseData parseData,
+                  OutputCollector<Text, Inlinks> output, Reporter reporter)
     throws IOException {
     String fromUrl = key.toString();
     String fromHost = getHost(fromUrl);
@@ -97,7 +94,6 @@
       }
     }
     if (fromUrl == null) return; // discard all outlinks
-    ParseData parseData = (ParseData)value;
     Outlink[] outlinks = parseData.getOutlinks();
     Inlinks inlinks = new Inlinks();
     for (int i = 0; i < outlinks.length; i++) {
@@ -147,8 +143,8 @@
 
   public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException {
     final FileSystem fs = FileSystem.get(getConf());
-    Path[] files = fs.listPaths(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
-    invert(linkDb, files, normalize, filter, force);
+    FileStatus[] files = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+    invert(linkDb, HadoopFSUtil.getPaths(files), normalize, filter, force);
   }
 
   public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException {
@@ -249,7 +245,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new LinkDb().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDb(), args);
     System.exit(res);
   }
   
@@ -265,7 +261,7 @@
       return -1;
     }
     Path segDir = null;
-    final FileSystem fs = FileSystem.get(conf);
+    final FileSystem fs = FileSystem.get(getConf());
     Path db = new Path(args[0]);
     ArrayList<Path> segs = new ArrayList<Path>();
     boolean filter = true;
@@ -274,15 +270,8 @@
     for (int i = 1; i < args.length; i++) {
       if (args[i].equals("-dir")) {
         segDir = new Path(args[++i]);
-        Path[] files = fs.listPaths(segDir, new PathFilter() {
-          public boolean accept(Path f) {
-            try {
-              if (fs.getFileStatus(f).isDir()) return true;
-            } catch (IOException ioe) {};
-            return false;
-          }
-        });
-        if (files != null) segs.addAll(Arrays.asList(files));
+        FileStatus[] files = fs.listStatus(segDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+        if (files != null) segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(files)));
         break;
       } else if (args[i].equalsIgnoreCase("-noNormalize")) {
         normalize = false;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Wed Mar 19 03:34:14 2008
@@ -24,10 +24,10 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.MapFileOutputFormat;
@@ -36,7 +36,8 @@
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 
@@ -58,7 +59,7 @@
  * 
  * @author Andrzej Bialecki
  */
-public class LinkDbMerger extends ToolBase implements Reducer {
+public class LinkDbMerger extends Configured implements Tool, Reducer<Text, Inlinks, Text, Inlinks> {
   private static final Log LOG = LogFactory.getLog(LinkDbMerger.class);
   
   private int maxInlinks;
@@ -71,12 +72,12 @@
     setConf(conf);
   }
 
-  public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+  public void reduce(Text key, Iterator<Inlinks> values, OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
 
     Inlinks result = new Inlinks();
 
     while (values.hasNext()) {
-      Inlinks inlinks = (Inlinks)values.next();
+      Inlinks inlinks = values.next();
 
       int end = Math.min(maxInlinks - result.size(), inlinks.size());
       Iterator<Inlink> it = inlinks.iterator();
@@ -135,7 +136,7 @@
    * @param args
    */
   public static void main(String[] args) throws Exception {
-    int res = new LinkDbMerger().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(), args);
     System.exit(res);
   }
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Wed Mar 19 03:34:14 2008
@@ -23,12 +23,12 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.mapred.lib.HashPartitioner;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 import org.apache.hadoop.conf.Configuration;
 
 import org.apache.nutch.util.NutchConfiguration;
@@ -37,10 +37,10 @@
 import java.util.Iterator;
 
 /** . */
-public class LinkDbReader extends ToolBase implements Closeable {
+public class LinkDbReader extends Configured implements Tool, Closeable {
   public static final Log LOG = LogFactory.getLog(LinkDbReader.class);
 
-  private static final Partitioner PARTITIONER = new HashPartitioner();
+  private static final Partitioner<WritableComparable, Writable> PARTITIONER = new HashPartitioner<WritableComparable, Writable>();
 
   private FileSystem fs;
   private Path directory;
@@ -111,7 +111,7 @@
   }
   
   public static void main(String[] args) throws Exception {
-    int res = new LinkDbReader().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(), args);
     System.exit(res);
   }
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java Wed Mar 19 03:34:14 2008
@@ -27,7 +27,7 @@
 import org.apache.nutch.net.URLNormalizers;
 
 /** Partition urls by hostname. */
-public class PartitionUrlByHost implements Partitioner {
+public class PartitionUrlByHost implements Partitioner<Text, Writable> {
   private static final Log LOG = LogFactory.getLog(PartitionUrlByHost.class);
   
   private int seed;
@@ -41,9 +41,9 @@
   public void close() {}
 
   /** Hash by hostname. */
-  public int getPartition(WritableComparable key, Writable value,
+  public int getPartition(Text key, Writable value,
                           int numReduceTasks) {
-    String urlString = ((Text)key).toString();
+    String urlString = key.toString();
     try {
       urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
     } catch (Exception e) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Mar 19 03:34:14 2008
@@ -29,8 +29,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.NutchWritable;
@@ -45,7 +44,7 @@
 
 
 /** The fetcher. Most of the work is done by plugins. */
-public class Fetcher extends ToolBase implements MapRunnable { 
+public class Fetcher extends Configured implements Tool, MapRunnable<WritableComparable, Writable, Text, NutchWritable> { 
 
   public static final Log LOG = LogFactory.getLog(Fetcher.class);
   
@@ -55,7 +54,7 @@
 
   public static final String PROTOCOL_REDIR = "protocol";
 
-  public static class InputFormat extends SequenceFileInputFormat {
+  public static class InputFormat extends SequenceFileInputFormat<WritableComparable, Writable> {
     /** Don't split inputs, to keep things polite. */
     public InputSplit[] getSplits(JobConf job, int nSplits)
       throws IOException {
@@ -69,8 +68,8 @@
     }
   }
 
-  private RecordReader input;
-  private OutputCollector output;
+  private RecordReader<WritableComparable, Writable> input;
+  private OutputCollector<Text, NutchWritable> output;
   private Reporter reporter;
 
   private String segmentName;
@@ -455,7 +454,7 @@
     return conf.getBoolean("fetcher.store.content", true);
   }
 
-  public void run(RecordReader input, OutputCollector output,
+  public void run(RecordReader<WritableComparable, Writable> input, OutputCollector<Text, NutchWritable> output,
                   Reporter reporter) throws IOException {
 
     this.input = input;
@@ -529,7 +528,7 @@
 
   /** Run the fetcher. */
   public static void main(String[] args) throws Exception {
-    int res = new Fetcher().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new Fetcher(), args);
     System.exit(res);
   }
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Wed Mar 19 03:34:14 2008
@@ -28,9 +28,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -63,8 +61,8 @@
  * 
  * @author Andrzej Bialecki
  */
-public class DeleteDuplicates extends ToolBase
-  implements Mapper, Reducer, OutputFormat {
+public class DeleteDuplicates extends Configured
+  implements Tool, Mapper<WritableComparable, Writable, Text, IntWritable>, Reducer<Text, IntWritable, WritableComparable, Writable>, OutputFormat<WritableComparable, Writable> {
   private static final Log LOG = LogFactory.getLog(DeleteDuplicates.class);
 
 //   Algorithm:
@@ -141,7 +139,7 @@
 
   }
 
-  public static class InputFormat extends InputFormatBase {
+  public static class InputFormat extends FileInputFormat<Text, IndexDoc> {
     private static final long INDEX_LENGTH = Integer.MAX_VALUE;
 
     /** Return each index as a split. */
@@ -155,7 +153,7 @@
       return splits;
     }
 
-    public class DDRecordReader implements RecordReader {
+    public class DDRecordReader implements RecordReader<Text, IndexDoc> {
 
       private IndexReader indexReader;
       private int maxDoc = 0;
@@ -174,7 +172,7 @@
         this.index = index;
       }
 
-      public boolean next(WritableComparable key, Writable value)
+      public boolean next(Text key, IndexDoc indexDoc)
         throws IOException {
         
         // skip empty indexes
@@ -189,9 +187,8 @@
         Document document = indexReader.document(doc);
 
         // fill in key
-        ((Text)key).set(document.get("url"));
+        key.set(document.get("url"));
         // fill in value
-        IndexDoc indexDoc = (IndexDoc)value;
         indexDoc.keep = true;
         indexDoc.url.set(document.get("url"));
         indexDoc.hash.setDigest(document.get("digest"));
@@ -226,11 +223,11 @@
         indexReader.close();
       }
       
-      public WritableComparable createKey() {
+      public Text createKey() {
         return new Text();
       }
       
-      public Writable createValue() {
+      public IndexDoc createValue() {
         return new IndexDoc();
       }
 
@@ -240,7 +237,7 @@
     }
     
     /** Return each index as a split. */
-    public RecordReader getRecordReader(InputSplit split,
+    public RecordReader<Text, IndexDoc> getRecordReader(InputSplit split,
                                         JobConf job,
                                         Reporter reporter) throws IOException {
       FileSplit fsplit = (FileSplit)split;
@@ -250,27 +247,27 @@
     }
   }
   
-  public static class HashPartitioner implements Partitioner {
+  public static class HashPartitioner implements Partitioner<MD5Hash, Writable> {
     public void configure(JobConf job) {}
     public void close() {}
-    public int getPartition(WritableComparable key, Writable value,
+    public int getPartition(MD5Hash key, Writable value,
                             int numReduceTasks) {
-      int hashCode = ((MD5Hash)key).hashCode();
+      int hashCode = key.hashCode();
       return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
     }
   }
 
-  public static class UrlsReducer implements Reducer {
+  public static class UrlsReducer implements Reducer<Text, IndexDoc, MD5Hash, IndexDoc> {
     
     public void configure(JobConf job) {}
     
     public void close() {}
     
-    public void reduce(WritableComparable key, Iterator values,
-        OutputCollector output, Reporter reporter) throws IOException {
+    public void reduce(Text key, Iterator<IndexDoc> values,
+        OutputCollector<MD5Hash, IndexDoc> output, Reporter reporter) throws IOException {
       IndexDoc latest = null;
       while (values.hasNext()) {
-        IndexDoc value = (IndexDoc)values.next();
+        IndexDoc value = values.next();
         if (latest == null) {
           latest = value;
           continue;
@@ -296,7 +293,7 @@
     }
   }
   
-  public static class HashReducer implements Reducer {
+  public static class HashReducer implements Reducer<MD5Hash, IndexDoc, Text, IndexDoc> {
     boolean byScore;
     
     public void configure(JobConf job) {
@@ -304,12 +301,12 @@
     }
     
     public void close() {}
-    public void reduce(WritableComparable key, Iterator values,
-                       OutputCollector output, Reporter reporter)
+    public void reduce(MD5Hash key, Iterator<IndexDoc> values,
+                       OutputCollector<Text, IndexDoc> output, Reporter reporter)
       throws IOException {
       IndexDoc highest = null;
       while (values.hasNext()) {
-        IndexDoc value = (IndexDoc)values.next();
+        IndexDoc value = values.next();
         // skip already deleted
         if (!value.keep) {
           LOG.debug("-discard " + value + " (already marked)");
@@ -355,7 +352,7 @@
   public void setConf(Configuration conf) {
     super.setConf(conf);
     try {
-      fs = FileSystem.get(conf);
+      if(conf != null) fs = FileSystem.get(conf);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
@@ -365,7 +362,7 @@
 
   /** Map [*,IndexDoc] pairs to [index,doc] pairs. */
   public void map(WritableComparable key, Writable value,
-                  OutputCollector output, Reporter reporter)
+                  OutputCollector<Text, IntWritable> output, Reporter reporter)
     throws IOException {
     IndexDoc indexDoc = (IndexDoc)value;
     // don't delete these
@@ -375,14 +372,14 @@
   }
 
   /** Delete docs named in values from index named in key. */
-  public void reduce(WritableComparable key, Iterator values,
-                     OutputCollector output, Reporter reporter)
+  public void reduce(Text key, Iterator<IntWritable> values,
+                     OutputCollector<WritableComparable, Writable> output, Reporter reporter)
     throws IOException {
     Path index = new Path(key.toString());
     IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
     try {
       while (values.hasNext()) {
-        IntWritable value = (IntWritable)values.next();
+        IntWritable value = values.next();
         LOG.debug("-delete " + index + " doc=" + value);
         reader.deleteDocument(value.get());
       }
@@ -392,11 +389,11 @@
   }
 
   /** Write nothing. */
-  public RecordWriter getRecordWriter(final FileSystem fs,
+  public RecordWriter<WritableComparable, Writable> getRecordWriter(final FileSystem fs,
                                       final JobConf job,
                                       final String name,
                                       final Progressable progress) throws IOException {
-    return new RecordWriter() {                   
+    return new RecordWriter<WritableComparable, Writable>() {                   
         public void write(WritableComparable key, Writable value)
           throws IOException {
           throw new UnsupportedOperationException();
@@ -496,7 +493,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new DeleteDuplicates().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new DeleteDuplicates(), args);
     System.exit(res);
   }
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Wed Mar 19 03:34:14 2008
@@ -25,8 +25,7 @@
 
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.mapred.FileAlreadyExistsException;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 import org.apache.hadoop.conf.*;
 
 import org.apache.nutch.util.HadoopFSUtil;
@@ -43,7 +42,7 @@
  * @author Doug Cutting
  * @author Mike Cafarella
  *************************************************************************/
-public class IndexMerger extends ToolBase {
+public class IndexMerger extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(IndexMerger.class);
 
   public static final String DONE_NAME = "merge.done";
@@ -81,17 +80,17 @@
     Directory[] dirs = new Directory[indexes.length];
     for (int i = 0; i < indexes.length; i++) {
       if (LOG.isInfoEnabled()) { LOG.info("Adding " + indexes[i]); }
-      dirs[i] = new FsDirectory(fs, indexes[i], false, this.conf);
+      dirs[i] = new FsDirectory(fs, indexes[i], false, getConf());
     }
 
     //
     // Merge indices
     //
     IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
-    writer.setMergeFactor(conf.getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
-    writer.setMaxBufferedDocs(conf.getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
-    writer.setMaxMergeDocs(conf.getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
-    writer.setTermIndexInterval(conf.getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
+    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
+    writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
+    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
+    writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
     writer.setInfoStream(LogUtil.getDebugStream(LOG));
     writer.setUseCompoundFile(false);
     writer.setSimilarity(new NutchSimilarity());
@@ -109,7 +108,7 @@
    * Create an index for the input files in the named directory. 
    */
   public static void main(String[] args) throws Exception {
-    int res = new IndexMerger().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new IndexMerger(), args);
     System.exit(res);
   }
   
@@ -123,7 +122,7 @@
     //
     // Parse args, read all index directories to be processed
     //
-    FileSystem fs = FileSystem.get(conf);
+    FileSystem fs = FileSystem.get(getConf());
     List<Path> indexDirs = new ArrayList<Path>();
 
     Path workDir = new Path("indexmerger-" + System.currentTimeMillis());  
@@ -152,7 +151,7 @@
       LOG.fatal("IndexMerger: " + StringUtils.stringifyException(e));
       return -1;
     } finally {
-      FileSystem.getLocal(conf).delete(workDir);
+      FileSystem.getLocal(getConf()).delete(workDir);
     }
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Wed Mar 19 03:34:14 2008
@@ -32,12 +32,12 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.*;
 
 /** Sort a Nutch index by page score.  Higher scoring documents are assigned
  * smaller document numbers. */
-public class IndexSorter extends ToolBase {
+public class IndexSorter extends Configured implements Tool {
   private static final Log LOG = LogFactory.getLog(IndexSorter.class);
   
   private static class PostingMap implements Comparable<PostingMap> {
@@ -300,7 +300,7 @@
 
   /** */
   public static void main(String[] args) throws Exception {
-    int res = new IndexSorter().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new IndexSorter(), args);
     System.exit(res);
   }
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed Mar 19 03:34:14 2008
@@ -27,9 +27,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 import org.apache.nutch.parse.*;
 import org.apache.nutch.analysis.*;
 
@@ -51,7 +49,7 @@
 import org.apache.nutch.metadata.Nutch;
 
 /** Create indexes for segments. */
-public class Indexer extends ToolBase implements Reducer, Mapper {
+public class Indexer extends Configured implements Tool, Reducer<Text, NutchWritable, Text, Writable>, Mapper<Text, Writable, Text, NutchWritable> {
   
   public static final String DONE_NAME = "index.done";
 
@@ -85,8 +83,8 @@
 
   /** Unwrap Lucene Documents created by reduce and add them to an index. */
   public static class OutputFormat
-    extends org.apache.hadoop.mapred.OutputFormatBase {
-    public RecordWriter getRecordWriter(final FileSystem fs, JobConf job,
+    extends org.apache.hadoop.mapred.OutputFormatBase<WritableComparable, LuceneDocumentWrapper> {
+    public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job,
                                         String name, final Progressable progress) throws IOException {
       final Path perm = new Path(job.getOutputPath(), name);
       final Path temp =
@@ -109,12 +107,12 @@
       writer.setUseCompoundFile(false);
       writer.setSimilarity(new NutchSimilarity());
 
-      return new RecordWriter() {
+      return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
           boolean closed;
 
-          public void write(WritableComparable key, Writable value)
+          public void write(WritableComparable key, LuceneDocumentWrapper value)
             throws IOException {                  // unwrap & index doc
-            Document doc = ((LuceneDocumentWrapper) value).get();
+            Document doc = value.get();
             NutchAnalyzer analyzer = factory.get(doc.get("lang"));
             if (LOG.isInfoEnabled()) {
               LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
@@ -174,8 +172,8 @@
 
   public void close() {}
 
-  public void reduce(WritableComparable key, Iterator values,
-                     OutputCollector output, Reporter reporter)
+  public void reduce(Text key, Iterator<NutchWritable> values,
+                     OutputCollector<Text, Writable> output, Reporter reporter)
     throws IOException {
     Inlinks inlinks = null;
     CrawlDatum dbDatum = null;
@@ -183,7 +181,7 @@
     ParseData parseData = null;
     ParseText parseText = null;
     while (values.hasNext()) {
-      Writable value = ((NutchWritable)values.next()).get(); // unwrap
+      Writable value = values.next().get(); // unwrap
       if (value instanceof Inlinks) {
         inlinks = (Inlinks)value;
       } else if (value instanceof CrawlDatum) {
@@ -248,7 +246,7 @@
         fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
       }
       // run indexing filters
-      doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
+      doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
     } catch (IndexingException e) {
       if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
       return;
@@ -315,7 +313,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new Indexer().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args);
     System.exit(res);
   }
   
@@ -341,8 +339,8 @@
     }
   }
 
-  public void map(WritableComparable key, Writable value,
-      OutputCollector output, Reporter reporter) throws IOException {
+  public void map(Text key, Writable value,
+      OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
     output.collect(key, new NutchWritable(value));
   }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Mar 19 03:34:14 2008
@@ -23,7 +23,7 @@
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.*;
 import org.apache.hadoop.conf.*;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.*;
@@ -37,7 +37,7 @@
 import java.util.Map.Entry;
 
 /* Parse content in a segment. */
-public class ParseSegment extends Configured implements Mapper, Reducer {
+public class ParseSegment extends Configured implements Tool, Mapper<WritableComparable, Content, Text, ParseImpl>, Reducer<Text, Writable, Text, Writable> {
 
   public static final Log LOG = LogFactory.getLog(Parser.class);
   
@@ -60,15 +60,14 @@
   
   private Text newKey = new Text();
 
-  public void map(WritableComparable key, Writable value,
-                  OutputCollector output, Reporter reporter)
+  public void map(WritableComparable key, Content content,
+                  OutputCollector<Text, ParseImpl> output, Reporter reporter)
     throws IOException {
     // convert on the fly from old UTF8 keys
     if (key instanceof UTF8) {
       newKey.set(key.toString());
       key = newKey;
     }
-    Content content = (Content) value;
 
     ParseResult parseResult = null;
     try {
@@ -111,8 +110,8 @@
     }
   }
 
-  public void reduce(WritableComparable key, Iterator values,
-                     OutputCollector output, Reporter reporter)
+  public void reduce(Text key, Iterator<Writable> values,
+                     OutputCollector<Text, Writable> output, Reporter reporter)
     throws IOException {
     output.collect(key, (Writable)values.next()); // collect first value
   }
@@ -144,6 +143,11 @@
 
 
   public static void main(String[] args) throws Exception {
+	int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), args);
+	System.exit(res);
+  }
+	  
+  public int run(String[] args) throws Exception {
     Path segment;
 
     String usage = "Usage: ParseSegment segment";
@@ -151,11 +155,9 @@
     if (args.length == 0) {
       System.err.println(usage);
       System.exit(-1);
-    }
-      
+    }      
     segment = new Path(args[0]);
-
-    ParseSegment parseSegment = new ParseSegment(NutchConfiguration.create());
-    parseSegment.parse(segment);
+    parse(segment);
+    return 0;
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Wed Mar 19 03:34:14 2008
@@ -25,8 +25,8 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
@@ -38,7 +38,8 @@
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.mapred.TextInputFormat;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Generator;
 import org.apache.nutch.crawl.PartitionUrlByHost;
@@ -55,13 +56,15 @@
  * 
  * @author Andrzej Bialecki
  */
-public class FreeGenerator extends ToolBase {
+public class FreeGenerator extends Configured implements Tool {
   private static final Log LOG = LogFactory.getLog(FreeGenerator.class);
   
   private static final String FILTER_KEY = "free.generator.filter";
   private static final String NORMALIZE_KEY = "free.generator.normalize";
 
-  public static class FG extends MapReduceBase implements Mapper, Reducer {
+  public static class FG extends MapReduceBase
+  implements Mapper<WritableComparable, Text, Text, Generator.SelectorEntry>,
+  Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> {
     private URLNormalizers normalizers = null;
     private URLFilters filters = null;
     private ScoringFilters scfilters;
@@ -82,7 +85,8 @@
     
     Generator.SelectorEntry entry = new Generator.SelectorEntry();
 
-    public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+    public void map(WritableComparable key, Text value, OutputCollector<Text,
+        Generator.SelectorEntry> output, Reporter reporter) throws IOException {
       // value is a line of text
       String urlString = value.toString();
       try {
@@ -111,7 +115,8 @@
       output.collect(url, entry);
     }
 
-    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+    public void reduce(Text key, Iterator<Generator.SelectorEntry> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
       // pick unique urls from values - discard the reduce key due to hash collisions
       HashMap<Text, CrawlDatum> unique = new HashMap<Text, CrawlDatum>();
       while (values.hasNext()) {
@@ -177,7 +182,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new FreeGenerator().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), args);
     System.exit(res);
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java Wed Mar 19 03:34:14 2008
@@ -24,18 +24,18 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.crawl.SignatureFactory;
@@ -65,11 +65,10 @@
  * <p>Arc files are tars of compressed gzips which are produced by both the
  * internet archive project and the grub distributed crawler project.</p>
  * 
- * TODO: This class needs to be changed to use ToolRunner instead of ToolBase.
  */
 public class ArcSegmentCreator
-  extends ToolBase
-  implements Mapper {
+  extends Configured
+  implements Tool, Mapper<Text, BytesWritable, Text, NutchWritable> {
 
   public static final Log LOG = LogFactory.getLog(ArcSegmentCreator.class);
   public static final String URL_VERSION = "arc.url.version";
@@ -145,7 +144,7 @@
    * 
    * @return The result of the parse in a ParseStatus object.
    */
-  private ParseStatus output(OutputCollector output, String segmentName,
+  private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName,
     Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus,
     int status) {
 
@@ -184,7 +183,7 @@
       // set the content signature
       if (parseResult == null) {
         byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
-          content, new ParseStatus().getEmptyParse(conf));
+          content, new ParseStatus().getEmptyParse(getConf()));
         datum.setSignature(signature);
       }
 
@@ -266,12 +265,12 @@
    * segments.</p>
    * 
    * @param key The arc record header.
-   * @param value The arc record raw content bytes.
+   * @param bytes The arc record raw content bytes.
    * @param output The output collecter.
    * @param reporter The progress reporter.
    */
-  public void map(WritableComparable key, Writable value,
-    OutputCollector output, Reporter reporter)
+  public void map(Text key, BytesWritable bytes,
+    OutputCollector<Text, NutchWritable> output, Reporter reporter)
     throws IOException {
 
     String[] headers = key.toString().split("\\s+");
@@ -289,7 +288,6 @@
 
     // get the raw  bytes from the arc file, create a new crawldatum
     Text url = new Text();
-    BytesWritable bytes = (BytesWritable)value;
     CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
       1.0f);
     String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);
@@ -371,7 +369,7 @@
 
   public static void main(String args[])
     throws Exception {
-    int res = new ArcSegmentCreator().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args);
     System.exit(res);
   }
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Wed Mar 19 03:34:14 2008
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
@@ -49,6 +50,23 @@
             }
 
         };
+    }
+    
+    /**
+     * Turns an array of FileStatus into an array of Paths.
+     */
+    public static Path[] getPaths(FileStatus[] stats) {
+      if (stats == null) {
+        return null;
+      }
+      if (stats.length == 0) {
+        return new Path[0];
+      }
+      Path[] res = new Path[stats.length];
+      for (int i = 0; i < stats.length; i++) {
+        res[i] = stats[i].getPath();
+      }
+      return res;
     }
 
 }
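
The new HadoopFSUtil.getPaths() helper pairs with FileSystem.listStatus(), which the patch uses in CrawlDb and LinkDb in place of the deprecated listPaths(). A minimal usage sketch, assuming the Nutch classes above are on the classpath (the "segments" path is illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.util.HadoopFSUtil;

    // Illustrative usage only; not a class from this commit.
    public class ListSegmentDirs {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // listStatus() returns FileStatus[] rather than Path[]; getPaths() converts
        // the result back for callers that still work with Path arrays.
        FileStatus[] stats = fs.listStatus(new Path("segments"),
            HadoopFSUtil.getPassDirectoriesFilter(fs));
        Path[] dirs = HadoopFSUtil.getPaths(stats);
        if (dirs != null) {                 // getPaths() passes a null input through
          for (Path dir : dirs) {
            System.out.println(dir);
          }
        }
      }
    }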