Posted to commits@nutch.apache.org by cu...@apache.org on 2005/10/13 23:59:08 UTC

svn commit: r320934 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: crawl/Crawl.java indexer/IndexMerger.java

Author: cutting
Date: Thu Oct 13 14:59:06 2005
New Revision: 320934

URL: http://svn.apache.org/viewcvs?rev=320934&view=rev
Log:
Fix index merger for mapred: Crawl now writes the individual indexes under
'indexes' and merges them into a single 'index' directory, and IndexMerger
reads its inputs directly from NDFS via NdfsDirectory instead of first
copying each one to local disk.

Modified:
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
    lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=320934&r1=320933&r2=320934&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Thu Oct 13 14:59:06 2005
@@ -26,6 +26,7 @@
 import org.apache.nutch.fs.*;
 import org.apache.nutch.util.*;
 import org.apache.nutch.mapred.*;
+import org.apache.nutch.indexer.IndexMerger;
 
 public class Crawl {
   public static final Logger LOG =
@@ -91,7 +92,10 @@
     File crawlDb = new File(dir + "/crawldb");
     File linkDb = new File(dir + "/linkdb");
     File segments = new File(dir + "/segments");
-    File index = new File(dir + "/indexes");
+    File indexes = new File(dir + "/indexes");
+    File index = new File(dir + "/index");
+
+    File tmpDir = conf.getLocalFile("crawl", getDate());
       
     // initialize crawlDb
     new Injector(conf).inject(crawlDb, rootUrlFile);
@@ -109,9 +113,10 @@
       
     new LinkDb(conf).invert(linkDb, segments); // invert links
 
-    // index & dedup
-    new Indexer(conf).index(index, linkDb, fs.listFiles(segments));
-    new DeleteDuplicates(conf).dedup(new File[] { index });
+    // index, dedup & merge
+    new Indexer(conf).index(indexes, linkDb, fs.listFiles(segments));
+    new DeleteDuplicates(conf).dedup(new File[] { indexes });
+    new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir).merge();
 
     LOG.info("crawl finished: " + dir);
   }
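
For context, after this change a finished crawl directory holds both the
intermediate indexes and the merged result. A rough sketch of the layout,
with directory names taken from the code above (contents illustrative):

    <dir>/
      crawldb/    crawl database (initialized by Injector)
      linkdb/     inverted link database (LinkDb)
      segments/   fetched segments
      indexes/    individual indexes written by Indexer, dedup'd in place
      index/      the single merged index produced by IndexMerger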

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=320934&r1=320933&r2=320934&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexMerger.java Thu Oct 13 14:59:06 2005
@@ -53,39 +53,39 @@
   private NutchFileSystem nfs;
   private File outputIndex;
   private File localWorkingDir;
-  private File[] segments;
+  private File[] indexes;
 
   /**
-   * Merge all of the segments given
+   * Merge all of the indexes given
    */
-  public IndexMerger(NutchFileSystem nfs, File[] segments, File outputIndex, File localWorkingDir) throws IOException {
+  public IndexMerger(NutchFileSystem nfs, File[] indexes, File outputIndex, File localWorkingDir) throws IOException {
       this.nfs = nfs;
-      this.segments = segments;
+      this.indexes = indexes;
       this.outputIndex = outputIndex;
       this.localWorkingDir = localWorkingDir;
   }
 
   /**
-   * Load all input segment indices, then add to the single output index
+   * Add all input indexes to the single output index
    */
   public void merge() throws IOException {
     //
     // Open local copies of NFS indices
     //
-    Directory[] dirs = new Directory[segments.length];
-    File[] localSegments = new File[segments.length];
-    for (int i = 0; i < segments.length; i++) {
-        File tmpFile = new File(localWorkingDir, "indexmerge-" + new SimpleDateFormat("yyyMMddHHmmss").format(new Date(System.currentTimeMillis())));
-        localSegments[i] = nfs.startLocalInput(new File(segments[i], "index"), tmpFile);
-        dirs[i] = FSDirectory.getDirectory(localSegments[i], false);
-    }
 
-    //
     // Get local output target
     //
     File tmpLocalOutput = new File(localWorkingDir, "merge-output");
     File localOutput = nfs.startLocalOutput(outputIndex, tmpLocalOutput);
 
+    Directory[] dirs = new Directory[indexes.length];
+    for (int i = 0; i < indexes.length; i++) {
+      LOG.info("Adding " + indexes[i]);
+      dirs[i] = new NdfsDirectory(nfs, indexes[i], false);
+    }
+
     //
     // Merge indices
     //
@@ -105,12 +105,6 @@
     //
     nfs.completeLocalOutput(outputIndex, tmpLocalOutput);
 
-    //
-    // Delete all local inputs, if necessary
-    //
-    for (int i = 0; i < localSegments.length; i++) {
-        nfs.completeLocalInput(localSegments[i]);
-    }
     localWorkingDir.delete();
   }
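
The "Merge indices" section itself is untouched and therefore elided from
the diff above. Its core is just Lucene's multi-directory merge into the
locally staged output; a minimal sketch, assuming the Lucene 1.x IndexWriter
API and the localOutput and dirs variables set up earlier (not the exact
committed code):

    // Sketch only: merge every NDFS-backed input directory into the
    // local staging directory before it is copied back to NFS.
    IndexWriter writer = new IndexWriter(localOutput, null, true);
    writer.addIndexes(dirs);   // merges and optimizes; inputs stream over NDFS
    writer.close();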
 
@@ -118,50 +112,46 @@
    * Create an index for the input files in the named directory. 
    */
   public static void main(String[] args) throws Exception {
-    String usage = "IndexMerger (-local | -ndfs <nameserver:port>) [-workingdir <workingdir>] outputIndex segments...";
+    String usage = "IndexMerger [-workingdir <workingdir>] outputIndex indexesDir...";
     if (args.length < 2) {
       System.err.println("Usage: " + usage);
       return;
     }
 
     //
-    // Parse args, read all segment directories to be processed
+    // Parse args, read all index directories to be processed
     //
-    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
-    try {
-        File workingDir = new File(new File("").getCanonicalPath());
-        Vector segments = new Vector();
-
-        int i = 0;
-        if ("-workingdir".equals(args[i])) {
-            i++;
-            workingDir = new File(new File(args[i++]).getCanonicalPath());
-        }
-        File outputIndex = new File(args[i++]);
-
-        for (; i < args.length; i++) {
-            if (args[i] != null) {
-                segments.add(new File(args[i]));
-            }
-        }
-        workingDir = new File(workingDir, "indexmerger-workingdir");
-
-        //
-        // Merge the indices
-        //
-        File[] segmentFiles = (File[]) segments.toArray(new File[segments.size()]);
-        LOG.info("merging segment indexes to: " + outputIndex);
-
-        if (workingDir.exists()) {
-            FileUtil.fullyDelete(workingDir);
-        }
-        workingDir.mkdirs();
-        IndexMerger merger = new IndexMerger(nfs, segmentFiles, outputIndex, workingDir);
-        merger.merge();
-        LOG.info("done merging");
-        FileUtil.fullyDelete(workingDir);
-    } finally {
-        nfs.close();
+    NutchFileSystem nfs = NutchFileSystem.get();
+    File workDir = new File(new File("").getCanonicalPath());
+    List indexDirs = new ArrayList();
+
+    int i = 0;
+    if ("-workingdir".equals(args[i])) {
+      i++;
+      workDir = new File(new File(args[i++]).getCanonicalPath());
+    }
+    workDir = new File(workDir, "indexmerger-workingdir");
+
+    File outputIndex = new File(args[i++]);
+
+    for (; i < args.length; i++) {
+      indexDirs.addAll(Arrays.asList(nfs.listFiles(new File(args[i]))));
+    }
+
+    //
+    // Merge the indices
+    //
+    LOG.info("merging indexes to: " + outputIndex);
+
+    File[] indexFiles = (File[]) indexDirs.toArray(new File[indexDirs.size()]);
+
+    if (workDir.exists()) {
+      FileUtil.fullyDelete(workDir);
     }
+    workDir.mkdirs();
+    IndexMerger merger = new IndexMerger(nfs, indexFiles, outputIndex, workDir);
+    merger.merge();
+    LOG.info("done merging");
+    FileUtil.fullyDelete(workDir);
   }
 }
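
Since the -local/-ndfs flags are gone and the filesystem now comes from
NutchFileSystem.get(), an invocation against the layout Crawl produces would
look something like the following (paths illustrative; each trailing argument
is a directory whose children are the indexes to merge):

    java org.apache.nutch.indexer.IndexMerger -workingdir /tmp/merge crawl/index crawl/indexes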