You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/08/01 16:21:06 UTC

svn commit: r1368012 - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/indexer/solr/

Author: ferdy
Date: Wed Aug  1 14:21:05 2012
New Revision: 1368012

URL: http://svn.apache.org/viewvc?rev=1368012&view=rev
Log:
NUTCH-1444 Indexing should not create temporary files (do not extend from FileOutputFormat)

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Aug  1 14:21:05 2012
@@ -1,6 +1,7 @@
 Nutch Change Log
 
 Release 2.1 - Current Development
+* NUTCH-1444 Indexing should not create temporary files (do not extend from FileOutputFormat) (ferdy)
 
 * NUTCH-1443 Solr schema version is invalid (markus)
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java Wed Aug  1 14:21:05 2012
@@ -18,12 +18,13 @@ package org.apache.nutch.indexer;
 
 import java.io.IOException;
 
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.OutputFormat;
 import org.apache.hadoop.mapreduce.RecordWriter;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
-public class IndexerOutputFormat
-extends FileOutputFormat<String, NutchDocument> {
+public class IndexerOutputFormat extends OutputFormat<String, NutchDocument> {
 
   @Override
   public RecordWriter<String, NutchDocument> getRecordWriter(
@@ -33,7 +34,7 @@ extends FileOutputFormat<String, NutchDo
       NutchIndexWriterFactory.getNutchIndexWriters(job.getConfiguration());
 
     for (final NutchIndexWriter writer : writers) {
-      writer.open(job, FileOutputFormat.getUniqueFile(job, "part", ""));
+      writer.open(job);
     }
 
     return new RecordWriter<String, NutchDocument>() {
@@ -54,4 +55,33 @@ extends FileOutputFormat<String, NutchDo
       }
     };
   }
+
+  @Override
+  public void checkOutputSpecs(JobContext jobContext) throws IOException,
+      InterruptedException {
+  }
+
+  @Override
+  public OutputCommitter getOutputCommitter(TaskAttemptContext arg0)
+      throws IOException, InterruptedException {
+    //return an empty outputcommitter
+    return new OutputCommitter() {
+      @Override
+      public void setupTask(TaskAttemptContext arg0) throws IOException {
+      }
+      @Override
+      public void setupJob(JobContext arg0) throws IOException {
+      }
+      @Override
+      public boolean needsTaskCommit(TaskAttemptContext arg0) throws IOException {
+        return false;
+      }
+      @Override
+      public void commitTask(TaskAttemptContext arg0) throws IOException {
+      }
+      @Override
+      public void abortTask(TaskAttemptContext arg0) throws IOException {
+      }
+    };
+  }
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java Wed Aug  1 14:21:05 2012
@@ -22,7 +22,7 @@ import org.apache.hadoop.mapreduce.TaskA
 import org.apache.nutch.indexer.NutchDocument;
 
 public interface NutchIndexWriter {
-  public void open(TaskAttemptContext job, String name) throws IOException;
+  public void open(TaskAttemptContext job) throws IOException;
 
   public void write(NutchDocument doc) throws IOException;
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java Wed Aug  1 14:21:05 2012
@@ -16,28 +16,19 @@
  */
 package org.apache.nutch.indexer.solr;
 
-import java.io.IOException;
-import java.util.Collections;
-import java.util.HashMap;
 import java.util.Map;
-import java.util.Random;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.indexer.IndexerJob;
 import org.apache.nutch.indexer.NutchIndexWriterFactory;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.ToolUtil;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class SolrIndexerJob extends IndexerJob {
 
@@ -51,30 +42,22 @@ public class SolrIndexerJob extends Inde
     getConf().set(SolrConstants.SERVER_URL, solrUrl);
 
     currentJob = createIndexJob(getConf(), "solr-index", batchId);
-    Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-"
-                + new Random().nextInt());
 
-    FileOutputFormat.setOutputPath(currentJob, tmp);
     currentJob.waitForCompletion(true);
     ToolUtil.recordJobStatus(null, currentJob, results);
     return results;
   }
 
-  private void indexSolr(String solrUrl, String batchId) throws Exception {
+  public void indexSolr(String solrUrl, String batchId) throws Exception {
     LOG.info("SolrIndexerJob: starting");
 
-    try {
-      run(ToolUtil.toArgMap(
-          Nutch.ARG_SOLR, solrUrl,
-          Nutch.ARG_BATCH, batchId));
-      // do the commits once and for all the reducers in one go
-      SolrServer solr = new CommonsHttpSolrServer(solrUrl);
-      if (getConf().getBoolean(SolrConstants.COMMIT_INDEX, true)) {
-        solr.commit();
-      }
-    } finally {
-      FileSystem.get(getConf()).delete(
-          FileOutputFormat.getOutputPath(currentJob), true);
+    run(ToolUtil.toArgMap(
+        Nutch.ARG_SOLR, solrUrl,
+        Nutch.ARG_BATCH, batchId));
+    // do the commits once and for all the reducers in one go
+    SolrServer solr = new CommonsHttpSolrServer(solrUrl);
+    if (getConf().getBoolean(SolrConstants.COMMIT_INDEX, true)) {
+      solr.commit();
     }
     LOG.info("SolrIndexerJob: done.");
   }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Wed Aug  1 14:21:05 2012
@@ -34,7 +34,7 @@ import org.apache.solr.common.SolrInputD
 
 public class SolrWriter implements NutchIndexWriter {
 
-  public static Logger LOG = LoggerFactory.getLogger(SolrWriter.class);
+  public static final Logger LOG = LoggerFactory.getLogger(SolrWriter.class);
 
   private SolrServer solr;
   private SolrMappingReader solrMapping;
@@ -45,7 +45,7 @@ public class SolrWriter implements Nutch
   private int commitSize;
 
   @Override
-  public void open(TaskAttemptContext job, String name)
+  public void open(TaskAttemptContext job)
   throws IOException {
     Configuration conf = job.getConfiguration();
     solr = new CommonsHttpSolrServer(conf.get(SolrConstants.SERVER_URL));