You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/10/13 23:59:08 UTC
svn commit: r320934 - in
/lucene/nutch/branches/mapred/src/java/org/apache/nutch: crawl/Crawl.java
indexer/IndexMerger.java
Author: cutting
Date: Thu Oct 13 14:59:06 2005
New Revision: 320934
URL: http://svn.apache.org/viewcvs?rev=320934&view=rev
Log:
Fix index merger for mapred.
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=320934&r1=320933&r2=320934&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Thu Oct 13 14:59:06 2005
@@ -26,6 +26,7 @@
import org.apache.nutch.fs.*;
import org.apache.nutch.util.*;
import org.apache.nutch.mapred.*;
+import org.apache.nutch.indexer.IndexMerger;
public class Crawl {
public static final Logger LOG =
@@ -91,7 +92,10 @@
File crawlDb = new File(dir + "/crawldb");
File linkDb = new File(dir + "/linkdb");
File segments = new File(dir + "/segments");
- File index = new File(dir + "/indexes");
+ File indexes = new File(dir + "/indexes");
+ File index = new File(dir + "/index");
+
+ File tmpDir = conf.getLocalFile("crawl", getDate());
// initialize crawlDb
new Injector(conf).inject(crawlDb, rootUrlFile);
@@ -109,9 +113,10 @@
new LinkDb(conf).invert(linkDb, segments); // invert links
- // index & dedup
- new Indexer(conf).index(index, linkDb, fs.listFiles(segments));
- new DeleteDuplicates(conf).dedup(new File[] { index });
+ // index, dedup & merge
+ new Indexer(conf).index(indexes, linkDb, fs.listFiles(segments));
+ new DeleteDuplicates(conf).dedup(new File[] { indexes });
+ new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge();
LOG.info("crawl finished: " + dir);
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=320934&r1=320933&r2=320934&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java Thu Oct 13 14:59:06 2005
@@ -53,39 +53,39 @@
private NutchFileSystem nfs;
private File outputIndex;
private File localWorkingDir;
- private File[] segments;
+ private File[] indexes;
/**
- * Merge all of the segments given
+ * Merge all of the indexes given
*/
- public IndexMerger(NutchFileSystem nfs, File[] segments, File outputIndex, File localWorkingDir) throws IOException {
+ public IndexMerger(NutchFileSystem nfs, File[] indexes, File outputIndex, File localWorkingDir) throws IOException {
this.nfs = nfs;
- this.segments = segments;
+ this.indexes = indexes;
this.outputIndex = outputIndex;
this.localWorkingDir = localWorkingDir;
}
/**
- * Load all input segment indices, then add to the single output index
+ * Add all input indexes to the single output index
*/
public void merge() throws IOException {
//
// Open local copies of NFS indices
//
- Directory[] dirs = new Directory[segments.length];
- File[] localSegments = new File[segments.length];
- for (int i = 0; i < segments.length; i++) {
- File tmpFile = new File(localWorkingDir, "indexmerge-" + new SimpleDateFormat("yyyMMddHHmmss").format(new Date(System.currentTimeMillis())));
- localSegments[i] = nfs.startLocalInput(new File(segments[i], "index"), tmpFile);
- dirs[i] = FSDirectory.getDirectory(localSegments[i], false);
- }
- //
// Get local output target
//
File tmpLocalOutput = new File(localWorkingDir, "merge-output");
File localOutput = nfs.startLocalOutput(outputIndex, tmpLocalOutput);
+ Directory[] dirs = new Directory[indexes.length];
+ for (int i = 0; i < indexes.length; i++) {
+ LOG.info("Adding " + indexes[i]);
+ dirs[i] = new NdfsDirectory(nfs, indexes[i], false);
+ }
+
+ //
+
//
// Merge indices
//
@@ -105,12 +105,6 @@
//
nfs.completeLocalOutput(outputIndex, tmpLocalOutput);
- //
- // Delete all local inputs, if necessary
- //
- for (int i = 0; i < localSegments.length; i++) {
- nfs.completeLocalInput(localSegments[i]);
- }
localWorkingDir.delete();
}
@@ -118,50 +112,46 @@
* Create an index for the input files in the named directory.
*/
public static void main(String[] args) throws Exception {
- String usage = "IndexMerger (-local | -ndfs <nameserver:port>) [-workingdir <workingdir>] outputIndex segments...";
+ String usage = "IndexMerger [-workingdir <workingdir>] outputIndex indexesDir...";
if (args.length < 2) {
System.err.println("Usage: " + usage);
return;
}
//
- // Parse args, read all segment directories to be processed
+ // Parse args, read all index directories to be processed
//
- NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
- try {
- File workingDir = new File(new File("").getCanonicalPath());
- Vector segments = new Vector();
-
- int i = 0;
- if ("-workingdir".equals(args[i])) {
- i++;
- workingDir = new File(new File(args[i++]).getCanonicalPath());
- }
- File outputIndex = new File(args[i++]);
-
- for (; i < args.length; i++) {
- if (args[i] != null) {
- segments.add(new File(args[i]));
- }
- }
- workingDir = new File(workingDir, "indexmerger-workingdir");
-
- //
- // Merge the indices
- //
- File[] segmentFiles = (File[]) segments.toArray(new File[segments.size()]);
- LOG.info("merging segment indexes to: " + outputIndex);
-
- if (workingDir.exists()) {
- FileUtil.fullyDelete(workingDir);
- }
- workingDir.mkdirs();
- IndexMerger merger = new IndexMerger(nfs, segmentFiles, outputIndex, workingDir);
- merger.merge();
- LOG.info("done merging");
- FileUtil.fullyDelete(workingDir);
- } finally {
- nfs.close();
+ NutchFileSystem nfs = NutchFileSystem.get();
+ File workDir = new File(new File("").getCanonicalPath());
+ List indexDirs = new ArrayList();
+
+ int i = 0;
+ if ("-workingdir".equals(args[i])) {
+ i++;
+ workDir = new File(new File(args[i++]).getCanonicalPath());
+ }
+ workDir = new File(workDir, "indexmerger-workingdir");
+
+ File outputIndex = new File(args[i++]);
+
+ for (; i < args.length; i++) {
+ indexDirs.addAll(Arrays.asList(nfs.listFiles(new File(args[i]))));
+ }
+
+ //
+ // Merge the indices
+ //
+ LOG.info("merging indexes to: " + outputIndex);
+
+ File[] indexFiles = (File[])indexDirs.toArray(new File[indexDirs.size()]);
+
+ if (workDir.exists()) {
+ FileUtil.fullyDelete(workDir);
}
+ workDir.mkdirs();
+ IndexMerger merger = new IndexMerger(nfs,indexFiles,outputIndex,workDir);
+ merger.merge();
+ LOG.info("done merging");
+ FileUtil.fullyDelete(workDir);
}
}