You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2008/03/19 11:34:24 UTC
svn commit: r638779 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/
src/java/org/apache/nutch/tools/ src/java/org/apache/nutch/tools/arc/ src/java/org/apache/nutch/util/
Author: ab
Date: Wed Mar 19 03:34:14 2008
New Revision: 638779
URL: http://svn.apache.org/viewvc?rev=638779&view=rev
Log:
NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Mar 19 03:34:14 2008
@@ -239,6 +239,9 @@
87. NUTCH-223 - Crawl.java uses Integer.MAX_VALUE (Jeff Ritchie via ab)
+88. NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API.
+ (Emmanuel Joke, dogacan, ab)
+
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Mar 19 03:34:14 2008
@@ -28,8 +28,7 @@
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
@@ -40,7 +39,7 @@
* This class takes the output of the fetcher and updates the
* crawldb accordingly.
*/
-public class CrawlDb extends ToolBase {
+public class CrawlDb extends Configured implements Tool {
public static final Log LOG = LogFactory.getLog(CrawlDb.class);
public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
@@ -48,11 +47,8 @@
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
-
- public CrawlDb() {
-
- }
+ public CrawlDb() {}
public CrawlDb(Configuration conf) {
setConf(conf);
@@ -150,7 +146,7 @@
}
public static void main(String[] args) throws Exception {
- int res = new CrawlDb().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
System.exit(res);
}
@@ -182,8 +178,8 @@
} else if (args[i].equals("-noAdditions")) {
additionsAllowed = false;
} else if (args[i].equals("-dir")) {
- Path[] paths = fs.listPaths(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
- dirs.addAll(Arrays.asList(paths));
+ FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
+ dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
} else {
dirs.add(new Path(args[i]));
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Mar 19 03:34:14 2008
@@ -28,10 +28,9 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -50,10 +49,10 @@
*
* @author Andrzej Bialecki
*/
-public class CrawlDbMerger extends ToolBase {
+public class CrawlDbMerger extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
- public static class Merger extends MapReduceBase implements Reducer {
+ public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
MapWritable meta = new MapWritable();
private FetchSchedule schedule;
@@ -63,13 +62,13 @@
schedule = FetchScheduleFactory.getFetchSchedule(conf);
}
- public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
+ public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
CrawlDatum res = null;
long resTime = 0L;
meta.clear();
while (values.hasNext()) {
- CrawlDatum val = (CrawlDatum) values.next();
+ CrawlDatum val = values.next();
if (res == null) {
res = val;
resTime = schedule.calculateLastFetchTime(res);
@@ -138,7 +137,7 @@
* @param args
*/
public static void main(String[] args) throws Exception {
- int res = new CrawlDbMerger().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Mar 19 03:34:14 2008
@@ -29,8 +29,7 @@
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -45,7 +44,7 @@
import org.apache.nutch.util.NutchJob;
/** Generates a subset of a crawl db to fetch. */
-public class Generator extends ToolBase {
+public class Generator extends Configured implements Tool {
public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
@@ -81,7 +80,7 @@
}
/** Selects entries due for fetch. */
- public static class Selector implements Mapper, Partitioner, Reducer {
+ public static class Selector implements Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>, Partitioner<FloatWritable, Writable>, Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> {
private LongWritable genTime = new LongWritable(System.currentTimeMillis());
private long curTime;
private long limit;
@@ -89,7 +88,7 @@
private HashMap<String, IntWritable> hostCounts =
new HashMap<String, IntWritable>();
private int maxPerHost;
- private Partitioner hostPartitioner = new PartitionUrlByHost();
+ private Partitioner<Text, Writable> hostPartitioner = new PartitionUrlByHost();
private URLFilters filters;
private URLNormalizers normalizers;
private ScoringFilters scfilters;
@@ -120,10 +119,10 @@
public void close() {}
/** Select & invert subset due for fetch. */
- public void map(WritableComparable key, Writable value,
- OutputCollector output, Reporter reporter)
+ public void map(Text key, CrawlDatum value,
+ OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
throws IOException {
- Text url = (Text)key;
+ Text url = key;
if (filter) {
// If filtering is on don't generate URLs that don't pass URLFilters
try {
@@ -136,7 +135,7 @@
}
}
}
- CrawlDatum crawlDatum = (CrawlDatum)value;
+ CrawlDatum crawlDatum = value;
// check fetch schedule
if (!schedule.shouldFetch(url, crawlDatum, curTime)) {
@@ -167,20 +166,21 @@
}
/** Partition by host. */
- public int getPartition(WritableComparable key, Writable value,
+ public int getPartition(FloatWritable key, Writable value,
int numReduceTasks) {
return hostPartitioner.getPartition(((SelectorEntry)value).url, key,
numReduceTasks);
}
/** Collect until limit is reached. */
- public void reduce(WritableComparable key, Iterator values,
- OutputCollector output, Reporter reporter)
+ public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
+ OutputCollector<FloatWritable, SelectorEntry> output,
+ Reporter reporter)
throws IOException {
while (values.hasNext() && count < limit) {
- SelectorEntry entry = (SelectorEntry)values.next();
+ SelectorEntry entry = values.next();
Text url = entry.url;
String urlString = url.toString();
URL u = null;
@@ -268,22 +268,23 @@
}
}
- public static class SelectorInverseMapper extends MapReduceBase implements Mapper {
+ public static class SelectorInverseMapper extends MapReduceBase implements Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> {
- public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+ public void map(FloatWritable key, SelectorEntry value, OutputCollector<Text, SelectorEntry> output, Reporter reporter) throws IOException {
SelectorEntry entry = (SelectorEntry)value;
output.collect(entry.url, entry);
}
}
- public static class PartitionReducer extends MapReduceBase implements Reducer {
+ public static class PartitionReducer extends MapReduceBase
+ implements Reducer<Text, SelectorEntry, Text, CrawlDatum> {
- public void reduce(WritableComparable key, Iterator values,
- OutputCollector output, Reporter reporter) throws IOException {
+ public void reduce(Text key, Iterator<SelectorEntry> values,
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
// if using HashComparator, we get only one input key in case of hash collision
// so use only URLs from values
while (values.hasNext()) {
- SelectorEntry entry = (SelectorEntry)values.next();
+ SelectorEntry entry = values.next();
output.collect(entry.url, entry.datum);
}
}
@@ -323,27 +324,27 @@
/**
* Update the CrawlDB so that the next generate won't include the same URLs.
*/
- public static class CrawlDbUpdater extends MapReduceBase implements Mapper, Reducer {
+ public static class CrawlDbUpdater extends MapReduceBase implements Mapper<WritableComparable, Writable, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> {
long generateTime;
public void configure(JobConf job) {
generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
}
- public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+ public void map(WritableComparable key, Writable value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
if (key instanceof FloatWritable) { // tempDir source
SelectorEntry se = (SelectorEntry)value;
output.collect(se.url, se.datum);
} else {
- output.collect(key, value);
+ output.collect((Text)key, (CrawlDatum)value);
}
}
- public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+ public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
CrawlDatum orig = null;
LongWritable genTime = null;
while (values.hasNext()) {
- CrawlDatum val = (CrawlDatum)values.next();
+ CrawlDatum val = values.next();
if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
genTime = (LongWritable)val.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
if (genTime.get() != generateTime) {
@@ -359,13 +360,10 @@
orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
}
output.collect(key, orig);
- }
-
+ }
}
- public Generator() {
-
- }
+ public Generator() {}
public Generator(Configuration conf) {
setConf(conf);
@@ -564,7 +562,7 @@
* Generate a fetchlist from the crawldb.
*/
public static void main(String args[]) throws Exception {
- int res = new Generator().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new Generator(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Mar 19 03:34:14 2008
@@ -28,8 +28,7 @@
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.nutch.net.*;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -39,12 +38,12 @@
/** This class takes a flat file of URLs and adds them to the of pages to be
* crawled. Useful for bootstrapping the system. */
-public class Injector extends ToolBase {
+public class Injector extends Configured implements Tool {
public static final Log LOG = LogFactory.getLog(Injector.class);
/** Normalize and filter injected urls. */
- public static class InjectMapper implements Mapper {
+ public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
private URLNormalizers urlNormalizers;
private int interval;
private float scoreInjected;
@@ -65,12 +64,10 @@
public void close() {}
- public void map(WritableComparable key, Writable val,
- OutputCollector output, Reporter reporter)
+ public void map(WritableComparable key, Text value,
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
- Text value = (Text)val;
String url = value.toString(); // value is line of text
- // System.out.println("url: " +url);
try {
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
url = filters.filter(url); // filter the url
@@ -98,17 +95,17 @@
}
/** Combine multiple new entries for a url. */
- public static class InjectReducer implements Reducer {
+ public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
public void configure(JobConf job) {}
public void close() {}
- public void reduce(WritableComparable key, Iterator values,
- OutputCollector output, Reporter reporter)
+ public void reduce(Text key, Iterator<CrawlDatum> values,
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
CrawlDatum old = null;
CrawlDatum injected = null;
while (values.hasNext()) {
- CrawlDatum val = (CrawlDatum)values.next();
+ CrawlDatum val = values.next();
if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
injected = val;
injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
@@ -124,9 +121,7 @@
}
}
- public Injector() {
-
- }
+ public Injector() {}
public Injector(Configuration conf) {
setConf(conf);
@@ -179,7 +174,7 @@
}
public static void main(String[] args) throws Exception {
- int res = new Injector().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new Injector(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Wed Mar 19 03:34:14 2008
@@ -30,8 +30,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
@@ -42,7 +41,7 @@
import org.apache.nutch.util.NutchJob;
/** Maintains an inverted link map, listing incoming links for each url. */
-public class LinkDb extends ToolBase implements Mapper {
+public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData, Text, Inlinks> {
public static final Log LOG = LogFactory.getLog(LinkDb.class);
@@ -54,9 +53,7 @@
private URLFilters urlFilters;
private URLNormalizers urlNormalizers;
- public LinkDb() {
-
- }
+ public LinkDb() {}
public LinkDb(Configuration conf) {
setConf(conf);
@@ -75,8 +72,8 @@
public void close() {}
- public void map(WritableComparable key, Writable value,
- OutputCollector output, Reporter reporter)
+ public void map(Text key, ParseData parseData,
+ OutputCollector<Text, Inlinks> output, Reporter reporter)
throws IOException {
String fromUrl = key.toString();
String fromHost = getHost(fromUrl);
@@ -97,7 +94,6 @@
}
}
if (fromUrl == null) return; // discard all outlinks
- ParseData parseData = (ParseData)value;
Outlink[] outlinks = parseData.getOutlinks();
Inlinks inlinks = new Inlinks();
for (int i = 0; i < outlinks.length; i++) {
@@ -147,8 +143,8 @@
public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException {
final FileSystem fs = FileSystem.get(getConf());
- Path[] files = fs.listPaths(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
- invert(linkDb, files, normalize, filter, force);
+ FileStatus[] files = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+ invert(linkDb, HadoopFSUtil.getPaths(files), normalize, filter, force);
}
public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException {
@@ -249,7 +245,7 @@
}
public static void main(String[] args) throws Exception {
- int res = new LinkDb().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new LinkDb(), args);
System.exit(res);
}
@@ -265,7 +261,7 @@
return -1;
}
Path segDir = null;
- final FileSystem fs = FileSystem.get(conf);
+ final FileSystem fs = FileSystem.get(getConf());
Path db = new Path(args[0]);
ArrayList<Path> segs = new ArrayList<Path>();
boolean filter = true;
@@ -274,15 +270,8 @@
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-dir")) {
segDir = new Path(args[++i]);
- Path[] files = fs.listPaths(segDir, new PathFilter() {
- public boolean accept(Path f) {
- try {
- if (fs.getFileStatus(f).isDir()) return true;
- } catch (IOException ioe) {};
- return false;
- }
- });
- if (files != null) segs.addAll(Arrays.asList(files));
+ FileStatus[] files = fs.listStatus(segDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+ if (files != null) segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(files)));
break;
} else if (args[i].equalsIgnoreCase("-noNormalize")) {
normalize = false;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Wed Mar 19 03:34:14 2008
@@ -24,10 +24,10 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
@@ -36,7 +36,8 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -58,7 +59,7 @@
*
* @author Andrzej Bialecki
*/
-public class LinkDbMerger extends ToolBase implements Reducer {
+public class LinkDbMerger extends Configured implements Tool, Reducer<Text, Inlinks, Text, Inlinks> {
private static final Log LOG = LogFactory.getLog(LinkDbMerger.class);
private int maxInlinks;
@@ -71,12 +72,12 @@
setConf(conf);
}
- public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+ public void reduce(Text key, Iterator<Inlinks> values, OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
Inlinks result = new Inlinks();
while (values.hasNext()) {
- Inlinks inlinks = (Inlinks)values.next();
+ Inlinks inlinks = values.next();
int end = Math.min(maxInlinks - result.size(), inlinks.size());
Iterator<Inlink> it = inlinks.iterator();
@@ -135,7 +136,7 @@
* @param args
*/
public static void main(String[] args) throws Exception {
- int res = new LinkDbMerger().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Wed Mar 19 03:34:14 2008
@@ -23,12 +23,12 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.HashPartitioner;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
@@ -37,10 +37,10 @@
import java.util.Iterator;
/** . */
-public class LinkDbReader extends ToolBase implements Closeable {
+public class LinkDbReader extends Configured implements Tool, Closeable {
public static final Log LOG = LogFactory.getLog(LinkDbReader.class);
- private static final Partitioner PARTITIONER = new HashPartitioner();
+ private static final Partitioner<WritableComparable, Writable> PARTITIONER = new HashPartitioner<WritableComparable, Writable>();
private FileSystem fs;
private Path directory;
@@ -111,7 +111,7 @@
}
public static void main(String[] args) throws Exception {
- int res = new LinkDbReader().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java Wed Mar 19 03:34:14 2008
@@ -27,7 +27,7 @@
import org.apache.nutch.net.URLNormalizers;
/** Partition urls by hostname. */
-public class PartitionUrlByHost implements Partitioner {
+public class PartitionUrlByHost implements Partitioner<Text, Writable> {
private static final Log LOG = LogFactory.getLog(PartitionUrlByHost.class);
private int seed;
@@ -41,9 +41,9 @@
public void close() {}
/** Hash by hostname. */
- public int getPartition(WritableComparable key, Writable value,
+ public int getPartition(Text key, Writable value,
int numReduceTasks) {
- String urlString = ((Text)key).toString();
+ String urlString = key.toString();
try {
urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
} catch (Exception e) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Mar 19 03:34:14 2008
@@ -29,8 +29,7 @@
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
@@ -45,7 +44,7 @@
/** The fetcher. Most of the work is done by plugins. */
-public class Fetcher extends ToolBase implements MapRunnable {
+public class Fetcher extends Configured implements Tool, MapRunnable<WritableComparable, Writable, Text, NutchWritable> {
public static final Log LOG = LogFactory.getLog(Fetcher.class);
@@ -55,7 +54,7 @@
public static final String PROTOCOL_REDIR = "protocol";
- public static class InputFormat extends SequenceFileInputFormat {
+ public static class InputFormat extends SequenceFileInputFormat<WritableComparable, Writable> {
/** Don't split inputs, to keep things polite. */
public InputSplit[] getSplits(JobConf job, int nSplits)
throws IOException {
@@ -69,8 +68,8 @@
}
}
- private RecordReader input;
- private OutputCollector output;
+ private RecordReader<WritableComparable, Writable> input;
+ private OutputCollector<Text, NutchWritable> output;
private Reporter reporter;
private String segmentName;
@@ -455,7 +454,7 @@
return conf.getBoolean("fetcher.store.content", true);
}
- public void run(RecordReader input, OutputCollector output,
+ public void run(RecordReader<WritableComparable, Writable> input, OutputCollector<Text, NutchWritable> output,
Reporter reporter) throws IOException {
this.input = input;
@@ -529,7 +528,7 @@
/** Run the fetcher. */
public static void main(String[] args) throws Exception {
- int res = new Fetcher().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new Fetcher(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Wed Mar 19 03:34:14 2008
@@ -28,9 +28,7 @@
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -63,8 +61,8 @@
*
* @author Andrzej Bialecki
*/
-public class DeleteDuplicates extends ToolBase
- implements Mapper, Reducer, OutputFormat {
+public class DeleteDuplicates extends Configured
+ implements Tool, Mapper<WritableComparable, Writable, Text, IntWritable>, Reducer<Text, IntWritable, WritableComparable, Writable>, OutputFormat<WritableComparable, Writable> {
private static final Log LOG = LogFactory.getLog(DeleteDuplicates.class);
// Algorithm:
@@ -141,7 +139,7 @@
}
- public static class InputFormat extends InputFormatBase {
+ public static class InputFormat extends FileInputFormat<Text, IndexDoc> {
private static final long INDEX_LENGTH = Integer.MAX_VALUE;
/** Return each index as a split. */
@@ -155,7 +153,7 @@
return splits;
}
- public class DDRecordReader implements RecordReader {
+ public class DDRecordReader implements RecordReader<Text, IndexDoc> {
private IndexReader indexReader;
private int maxDoc = 0;
@@ -174,7 +172,7 @@
this.index = index;
}
- public boolean next(WritableComparable key, Writable value)
+ public boolean next(Text key, IndexDoc indexDoc)
throws IOException {
// skip empty indexes
@@ -189,9 +187,8 @@
Document document = indexReader.document(doc);
// fill in key
- ((Text)key).set(document.get("url"));
+ key.set(document.get("url"));
// fill in value
- IndexDoc indexDoc = (IndexDoc)value;
indexDoc.keep = true;
indexDoc.url.set(document.get("url"));
indexDoc.hash.setDigest(document.get("digest"));
@@ -226,11 +223,11 @@
indexReader.close();
}
- public WritableComparable createKey() {
+ public Text createKey() {
return new Text();
}
- public Writable createValue() {
+ public IndexDoc createValue() {
return new IndexDoc();
}
@@ -240,7 +237,7 @@
}
/** Return each index as a split. */
- public RecordReader getRecordReader(InputSplit split,
+ public RecordReader<Text, IndexDoc> getRecordReader(InputSplit split,
JobConf job,
Reporter reporter) throws IOException {
FileSplit fsplit = (FileSplit)split;
@@ -250,27 +247,27 @@
}
}
- public static class HashPartitioner implements Partitioner {
+ public static class HashPartitioner implements Partitioner<MD5Hash, Writable> {
public void configure(JobConf job) {}
public void close() {}
- public int getPartition(WritableComparable key, Writable value,
+ public int getPartition(MD5Hash key, Writable value,
int numReduceTasks) {
- int hashCode = ((MD5Hash)key).hashCode();
+ int hashCode = key.hashCode();
return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
}
}
- public static class UrlsReducer implements Reducer {
+ public static class UrlsReducer implements Reducer<Text, IndexDoc, MD5Hash, IndexDoc> {
public void configure(JobConf job) {}
public void close() {}
- public void reduce(WritableComparable key, Iterator values,
- OutputCollector output, Reporter reporter) throws IOException {
+ public void reduce(Text key, Iterator<IndexDoc> values,
+ OutputCollector<MD5Hash, IndexDoc> output, Reporter reporter) throws IOException {
IndexDoc latest = null;
while (values.hasNext()) {
- IndexDoc value = (IndexDoc)values.next();
+ IndexDoc value = values.next();
if (latest == null) {
latest = value;
continue;
@@ -296,7 +293,7 @@
}
}
- public static class HashReducer implements Reducer {
+ public static class HashReducer implements Reducer<MD5Hash, IndexDoc, Text, IndexDoc> {
boolean byScore;
public void configure(JobConf job) {
@@ -304,12 +301,12 @@
}
public void close() {}
- public void reduce(WritableComparable key, Iterator values,
- OutputCollector output, Reporter reporter)
+ public void reduce(MD5Hash key, Iterator<IndexDoc> values,
+ OutputCollector<Text, IndexDoc> output, Reporter reporter)
throws IOException {
IndexDoc highest = null;
while (values.hasNext()) {
- IndexDoc value = (IndexDoc)values.next();
+ IndexDoc value = values.next();
// skip already deleted
if (!value.keep) {
LOG.debug("-discard " + value + " (already marked)");
@@ -355,7 +352,7 @@
public void setConf(Configuration conf) {
super.setConf(conf);
try {
- fs = FileSystem.get(conf);
+ if(conf != null) fs = FileSystem.get(conf);
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -365,7 +362,7 @@
/** Map [*,IndexDoc] pairs to [index,doc] pairs. */
public void map(WritableComparable key, Writable value,
- OutputCollector output, Reporter reporter)
+ OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
IndexDoc indexDoc = (IndexDoc)value;
// don't delete these
@@ -375,14 +372,14 @@
}
/** Delete docs named in values from index named in key. */
- public void reduce(WritableComparable key, Iterator values,
- OutputCollector output, Reporter reporter)
+ public void reduce(Text key, Iterator<IntWritable> values,
+ OutputCollector<WritableComparable, Writable> output, Reporter reporter)
throws IOException {
Path index = new Path(key.toString());
IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
try {
while (values.hasNext()) {
- IntWritable value = (IntWritable)values.next();
+ IntWritable value = values.next();
LOG.debug("-delete " + index + " doc=" + value);
reader.deleteDocument(value.get());
}
@@ -392,11 +389,11 @@
}
/** Write nothing. */
- public RecordWriter getRecordWriter(final FileSystem fs,
+ public RecordWriter<WritableComparable, Writable> getRecordWriter(final FileSystem fs,
final JobConf job,
final String name,
final Progressable progress) throws IOException {
- return new RecordWriter() {
+ return new RecordWriter<WritableComparable, Writable>() {
public void write(WritableComparable key, Writable value)
throws IOException {
throw new UnsupportedOperationException();
@@ -496,7 +493,7 @@
}
public static void main(String[] args) throws Exception {
- int res = new DeleteDuplicates().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new DeleteDuplicates(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Wed Mar 19 03:34:14 2008
@@ -25,8 +25,7 @@
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;
import org.apache.nutch.util.HadoopFSUtil;
@@ -43,7 +42,7 @@
* @author Doug Cutting
* @author Mike Cafarella
*************************************************************************/
-public class IndexMerger extends ToolBase {
+public class IndexMerger extends Configured implements Tool {
public static final Log LOG = LogFactory.getLog(IndexMerger.class);
public static final String DONE_NAME = "merge.done";
@@ -81,17 +80,17 @@
Directory[] dirs = new Directory[indexes.length];
for (int i = 0; i < indexes.length; i++) {
if (LOG.isInfoEnabled()) { LOG.info("Adding " + indexes[i]); }
- dirs[i] = new FsDirectory(fs, indexes[i], false, this.conf);
+ dirs[i] = new FsDirectory(fs, indexes[i], false, getConf());
}
//
// Merge indices
//
IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
- writer.setMergeFactor(conf.getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
- writer.setMaxBufferedDocs(conf.getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
- writer.setMaxMergeDocs(conf.getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
- writer.setTermIndexInterval(conf.getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
+ writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
+ writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
+ writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
+ writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
writer.setInfoStream(LogUtil.getDebugStream(LOG));
writer.setUseCompoundFile(false);
writer.setSimilarity(new NutchSimilarity());
@@ -109,7 +108,7 @@
* Create an index for the input files in the named directory.
*/
public static void main(String[] args) throws Exception {
- int res = new IndexMerger().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new IndexMerger(), args);
System.exit(res);
}
@@ -123,7 +122,7 @@
//
// Parse args, read all index directories to be processed
//
- FileSystem fs = FileSystem.get(conf);
+ FileSystem fs = FileSystem.get(getConf());
List<Path> indexDirs = new ArrayList<Path>();
Path workDir = new Path("indexmerger-" + System.currentTimeMillis());
@@ -152,7 +151,7 @@
LOG.fatal("IndexMerger: " + StringUtils.stringifyException(e));
return -1;
} finally {
- FileSystem.getLocal(conf).delete(workDir);
+ FileSystem.getLocal(getConf()).delete(workDir);
}
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Wed Mar 19 03:34:14 2008
@@ -32,12 +32,12 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.*;
/** Sort a Nutch index by page score. Higher scoring documents are assigned
* smaller document numbers. */
-public class IndexSorter extends ToolBase {
+public class IndexSorter extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(IndexSorter.class);
private static class PostingMap implements Comparable<PostingMap> {
@@ -300,7 +300,7 @@
/** */
public static void main(String[] args) throws Exception {
- int res = new IndexSorter().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new IndexSorter(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed Mar 19 03:34:14 2008
@@ -27,9 +27,7 @@
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
import org.apache.nutch.parse.*;
import org.apache.nutch.analysis.*;
@@ -51,7 +49,7 @@
import org.apache.nutch.metadata.Nutch;
/** Create indexes for segments. */
-public class Indexer extends ToolBase implements Reducer, Mapper {
+public class Indexer extends Configured implements Tool, Reducer<Text, NutchWritable, Text, Writable>, Mapper<Text, Writable, Text, NutchWritable> {
public static final String DONE_NAME = "index.done";
@@ -85,8 +83,8 @@
/** Unwrap Lucene Documents created by reduce and add them to an index. */
public static class OutputFormat
- extends org.apache.hadoop.mapred.OutputFormatBase {
- public RecordWriter getRecordWriter(final FileSystem fs, JobConf job,
+ extends org.apache.hadoop.mapred.OutputFormatBase<WritableComparable, LuceneDocumentWrapper> {
+ public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job,
String name, final Progressable progress) throws IOException {
final Path perm = new Path(job.getOutputPath(), name);
final Path temp =
@@ -109,12 +107,12 @@
writer.setUseCompoundFile(false);
writer.setSimilarity(new NutchSimilarity());
- return new RecordWriter() {
+ return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
boolean closed;
- public void write(WritableComparable key, Writable value)
+ public void write(WritableComparable key, LuceneDocumentWrapper value)
throws IOException { // unwrap & index doc
- Document doc = ((LuceneDocumentWrapper) value).get();
+ Document doc = value.get();
NutchAnalyzer analyzer = factory.get(doc.get("lang"));
if (LOG.isInfoEnabled()) {
LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
@@ -174,8 +172,8 @@
public void close() {}
- public void reduce(WritableComparable key, Iterator values,
- OutputCollector output, Reporter reporter)
+ public void reduce(Text key, Iterator<NutchWritable> values,
+ OutputCollector<Text, Writable> output, Reporter reporter)
throws IOException {
Inlinks inlinks = null;
CrawlDatum dbDatum = null;
@@ -183,7 +181,7 @@
ParseData parseData = null;
ParseText parseText = null;
while (values.hasNext()) {
- Writable value = ((NutchWritable)values.next()).get(); // unwrap
+ Writable value = values.next().get(); // unwrap
if (value instanceof Inlinks) {
inlinks = (Inlinks)value;
} else if (value instanceof CrawlDatum) {
@@ -248,7 +246,7 @@
fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
}
// run indexing filters
- doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
+ doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
} catch (IndexingException e) {
if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
return;
@@ -315,7 +313,7 @@
}
public static void main(String[] args) throws Exception {
- int res = new Indexer().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args);
System.exit(res);
}
@@ -341,8 +339,8 @@
}
}
- public void map(WritableComparable key, Writable value,
- OutputCollector output, Reporter reporter) throws IOException {
+ public void map(Text key, Writable value,
+ OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
output.collect(key, new NutchWritable(value));
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Mar 19 03:34:14 2008
@@ -23,7 +23,7 @@
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.*;
@@ -37,7 +37,7 @@
import java.util.Map.Entry;
/* Parse content in a segment. */
-public class ParseSegment extends Configured implements Mapper, Reducer {
+public class ParseSegment extends Configured implements Tool, Mapper<WritableComparable, Content, Text, ParseImpl>, Reducer<Text, Writable, Text, Writable> {
public static final Log LOG = LogFactory.getLog(Parser.class);
@@ -60,15 +60,14 @@
private Text newKey = new Text();
- public void map(WritableComparable key, Writable value,
- OutputCollector output, Reporter reporter)
+ public void map(WritableComparable key, Content content,
+ OutputCollector<Text, ParseImpl> output, Reporter reporter)
throws IOException {
// convert on the fly from old UTF8 keys
if (key instanceof UTF8) {
newKey.set(key.toString());
key = newKey;
}
- Content content = (Content) value;
ParseResult parseResult = null;
try {
@@ -111,8 +110,8 @@
}
}
- public void reduce(WritableComparable key, Iterator values,
- OutputCollector output, Reporter reporter)
+ public void reduce(Text key, Iterator<Writable> values,
+ OutputCollector<Text, Writable> output, Reporter reporter)
throws IOException {
output.collect(key, (Writable)values.next()); // collect first value
}
@@ -144,6 +143,11 @@
public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), args);
+ System.exit(res);
+ }
+
+ public int run(String[] args) throws Exception {
Path segment;
String usage = "Usage: ParseSegment segment";
@@ -151,11 +155,9 @@
if (args.length == 0) {
System.err.println(usage);
System.exit(-1);
- }
-
+ }
segment = new Path(args[0]);
-
- ParseSegment parseSegment = new ParseSegment(NutchConfiguration.create());
- parseSegment.parse(segment);
+ parse(segment);
+ return 0;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Wed Mar 19 03:34:14 2008
@@ -25,8 +25,8 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
@@ -38,7 +38,8 @@
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.PartitionUrlByHost;
@@ -55,13 +56,15 @@
*
* @author Andrzej Bialecki
*/
-public class FreeGenerator extends ToolBase {
+public class FreeGenerator extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(FreeGenerator.class);
private static final String FILTER_KEY = "free.generator.filter";
private static final String NORMALIZE_KEY = "free.generator.normalize";
- public static class FG extends MapReduceBase implements Mapper, Reducer {
+ public static class FG extends MapReduceBase
+ implements Mapper<WritableComparable, Text, Text, Generator.SelectorEntry>,
+ Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> {
private URLNormalizers normalizers = null;
private URLFilters filters = null;
private ScoringFilters scfilters;
@@ -82,7 +85,8 @@
Generator.SelectorEntry entry = new Generator.SelectorEntry();
- public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+ public void map(WritableComparable key, Text value, OutputCollector<Text,
+ Generator.SelectorEntry> output, Reporter reporter) throws IOException {
// value is a line of text
String urlString = value.toString();
try {
@@ -111,7 +115,8 @@
output.collect(url, entry);
}
- public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+ public void reduce(Text key, Iterator<Generator.SelectorEntry> values,
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
// pick unique urls from values - discard the reduce key due to hash collisions
HashMap<Text, CrawlDatum> unique = new HashMap<Text, CrawlDatum>();
while (values.hasNext()) {
@@ -177,7 +182,7 @@
}
public static void main(String[] args) throws Exception {
- int res = new FreeGenerator().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), args);
System.exit(res);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java Wed Mar 19 03:34:14 2008
@@ -24,18 +24,18 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
@@ -65,11 +65,10 @@
* <p>Arc files are tars of compressed gzips which are produced by both the
* internet archive project and the grub distributed crawler project.</p>
*
- * TODO: This class needs to be changed to use ToolRunner instead of ToolBase.
*/
public class ArcSegmentCreator
- extends ToolBase
- implements Mapper {
+ extends Configured
+ implements Tool, Mapper<Text, BytesWritable, Text, NutchWritable> {
public static final Log LOG = LogFactory.getLog(ArcSegmentCreator.class);
public static final String URL_VERSION = "arc.url.version";
@@ -145,7 +144,7 @@
*
* @return The result of the parse in a ParseStatus object.
*/
- private ParseStatus output(OutputCollector output, String segmentName,
+ private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName,
Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus,
int status) {
@@ -184,7 +183,7 @@
// set the content signature
if (parseResult == null) {
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
- content, new ParseStatus().getEmptyParse(conf));
+ content, new ParseStatus().getEmptyParse(getConf()));
datum.setSignature(signature);
}
@@ -266,12 +265,12 @@
* segments.</p>
*
* @param key The arc record header.
- * @param value The arc record raw content bytes.
+ * @param bytes The arc record raw content bytes.
* @param output The output collecter.
* @param reporter The progress reporter.
*/
- public void map(WritableComparable key, Writable value,
- OutputCollector output, Reporter reporter)
+ public void map(Text key, BytesWritable bytes,
+ OutputCollector<Text, NutchWritable> output, Reporter reporter)
throws IOException {
String[] headers = key.toString().split("\\s+");
@@ -289,7 +288,6 @@
// get the raw bytes from the arc file, create a new crawldatum
Text url = new Text();
- BytesWritable bytes = (BytesWritable)value;
CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
1.0f);
String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);
@@ -371,7 +369,7 @@
public static void main(String args[])
throws Exception {
- int res = new ArcSegmentCreator().doMain(NutchConfiguration.create(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args);
System.exit(res);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Wed Mar 19 03:34:14 2008
@@ -18,6 +18,7 @@
import java.io.IOException;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
@@ -49,6 +50,23 @@
}
};
+ }
+
+ /**
+ * Turns an array of FileStatus into an array of Paths.
+ */
+ public static Path[] getPaths(FileStatus[] stats) {
+ if (stats == null) {
+ return null;
+ }
+ if (stats.length == 0) {
+ return new Path[0];
+ }
+ Path[] res = new Path[stats.length];
+ for (int i = 0; i < stats.length; i++) {
+ res[i] = stats[i].getPath();
+ }
+ return res;
}
}