Posted to user@nutch.apache.org by Susam Pal <su...@gmail.com> on 2008/01/31 20:54:49 UTC
Recrawl using org.apache.nutch.crawl.Crawl
I am interested to know why the following check is done to see
whether a crawl directory already exists:
    FileSystem fs = FileSystem.get(job);
    if (fs.exists(dir)) {
      throw new RuntimeException(dir + " already exists.");
    }
Is it only to save the user from overwriting his crawl directory? If
so, I have written a small patch that makes it possible to recrawl over
the same crawl directory by adding a "-force" option to the "bin/nutch
crawl" command line. With this patch, one can crawl and recrawl in the
following manner:
bin/nutch crawl urls -dir crawl -depth 2 -topN 10 -threads 5
bin/nutch crawl urls -dir crawl -depth 2 -topN 10 -threads 5 -force
Also, one may always use the -force option (even for the first crawl):
bin/nutch crawl urls -dir crawl -depth 2 -topN 10 -threads 5 -force
bin/nutch crawl urls -dir crawl -depth 2 -topN 10 -threads 5 -force
If one tries to crawl without the -force option when the crawl
directory already exists, the error message now includes a small hint:
# bin/nutch crawl urls -dir crawl -depth 2 -topN 10 -threads 5
Exception in thread "main" java.lang.RuntimeException: crawl already exists. Add -force option to recrawl.
        at org.apache.nutch.crawl.Crawl.main(Crawl.java:89)
This patch doesn't affect a crawl run without the -force option. With
-force, the old indexes are deleted before indexing, and the merged
index is written to a temporary newIndex directory that then replaces
the old index. Is this going to be useful?
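
To make the replacement step easy to try in isolation, here is a
minimal, self-contained sketch of the delete-and-rename idea. It is
only an illustration: the class name and the crawl/index and
crawl/newIndex paths are made up for the example, and it assumes a
newer Hadoop FileSystem API where delete() takes a recursive flag (the
patch itself uses the older single-argument delete()):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class IndexSwapSketch {
  public static void main(String[] args) throws Exception {
    // Defaults to the local file system when no cluster is configured.
    FileSystem fs = FileSystem.get(new Configuration());

    Path index = new Path("crawl/index");       // index from the last crawl
    Path newIndex = new Path("crawl/newIndex"); // freshly merged index

    // Stand-in for the merger output, so the sketch runs on its own.
    fs.mkdirs(newIndex);

    // Replace the old index with the new one. Note that this is a
    // delete followed by a rename, not an atomic swap.
    if (fs.exists(index)) {
      fs.delete(index, true); // true = recursive
    }
    fs.rename(newIndex, index);
  }
}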
I have included the patch both as text (after the signature) and as an
attachment.
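
If anyone wants to verify that a recrawl really replaced the index,
the merged index can be opened with the Lucene API that Nutch ships
with. A small sketch, assuming a Lucene 2.x style IndexReader and the
crawl/index directory from the examples above (the class name is made
up):

import org.apache.lucene.index.IndexReader;

public class CheckIndex {
  public static void main(String[] args) throws Exception {
    // Open the merged index written by bin/nutch crawl.
    IndexReader reader = IndexReader.open("crawl/index");
    System.out.println("documents in index: " + reader.numDocs());
    reader.close();
  }
}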
Regards,
Susam Pal
Index: src/java/org/apache/nutch/crawl/Crawl.java
===================================================================
--- src/java/org/apache/nutch/crawl/Crawl.java (revision 617192)
+++ src/java/org/apache/nutch/crawl/Crawl.java (working copy)
@@ -49,7 +49,8 @@
   public static void main(String args[]) throws Exception {
     if (args.length < 1) {
       System.out.println
-        ("Usage: Crawl <urlDir> [-dir d] [-threads n] [-depth i] [-topN N]");
+        ("Usage: Crawl <urlDir> [-dir d] [-threads n] [-depth i] " +
+         "[-topN N] [-force]");
       return;
     }

@@ -62,6 +63,7 @@
     int threads = job.getInt("fetcher.threads.fetch", 10);
     int depth = 5;
     int topN = Integer.MAX_VALUE;
+    boolean force = false;

     for (int i = 0; i < args.length; i++) {
       if ("-dir".equals(args[i])) {
@@ -76,14 +78,17 @@
       } else if ("-topN".equals(args[i])) {
         topN = Integer.parseInt(args[i+1]);
         i++;
+      } else if ("-force".equals(args[i])) {
+        force = true;
       } else if (args[i] != null) {
         rootUrlDir = new Path(args[i]);
       }
     }

     FileSystem fs = FileSystem.get(job);
-    if (fs.exists(dir)) {
-      throw new RuntimeException(dir + " already exists.");
+    if (force == false && fs.exists(dir)) {
+      throw new RuntimeException(dir + " already exists. Add -force "
+          + "option to recrawl.");
     }

     if (LOG.isInfoEnabled()) {
@@ -93,6 +98,8 @@
       LOG.info("depth = " + depth);
       if (topN != Integer.MAX_VALUE)
         LOG.info("topN = " + topN);
+      if (force)
+        LOG.info("-force option found");
     }

     Path crawlDb = new Path(dir + "/crawldb");
@@ -100,6 +107,7 @@
     Path segments = new Path(dir + "/segments");
     Path indexes = new Path(dir + "/indexes");
     Path index = new Path(dir + "/index");
+    Path newIndex = new Path(dir + "/newIndex");

     Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
     Injector injector = new Injector(conf);
@@ -131,10 +139,27 @@
     if (i > 0) {
       linkDbTool.invert(linkDb, segments, true, true, false); // invert links

+      // In a recrawl, delete the indexes generated by any past crawl
+      if (force) {
+        fs.delete(indexes);
+      }
+
       // index, dedup & merge
-      indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments, HadoopFSUtil.getPassAllFilter()));
+      indexer.index(indexes, crawlDb, linkDb,
+          fs.listPaths(segments, HadoopFSUtil.getPassAllFilter()));
       dedup.dedup(new Path[] { indexes });
-      merger.merge(fs.listPaths(indexes, HadoopFSUtil.getPassAllFilter()), index, tmpDir);
+      if (force) {
+        // Since this is a recrawl, merge the indexes into a new directory
+        merger.merge(fs.listPaths(indexes, HadoopFSUtil.getPassAllFilter()),
+            newIndex, tmpDir);
+
+        // Replace the old index with the new merged index.
+        fs.delete(index);
+        fs.rename(newIndex, index);
+      } else {
+        merger.merge(fs.listPaths(indexes, HadoopFSUtil.getPassAllFilter()),
+            index, tmpDir);
+      }
     } else {
       LOG.warn("No URLs to fetch - check your seed list and URL filters.");
     }