You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/08/01 16:21:06 UTC
svn commit: r1368012 - in /nutch/branches/2.x: ./
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/indexer/solr/
Author: ferdy
Date: Wed Aug 1 14:21:05 2012
New Revision: 1368012
URL: http://svn.apache.org/viewvc?rev=1368012&view=rev
Log:
NUTCH-1444 Indexing should not create temporary files (do not extend from FileOutputFormat)
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Aug 1 14:21:05 2012
@@ -1,6 +1,7 @@
Nutch Change Log
Release 2.1 - Current Development
+* NUTCH-1444 Indexing should not create temporary files (do not extend from FileOutputFormat) (ferdy)
* NUTCH-1443 Solr schema version is invalid (markus)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java Wed Aug 1 14:21:05 2012
@@ -18,12 +18,13 @@ package org.apache.nutch.indexer;
import java.io.IOException;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-public class IndexerOutputFormat
-extends FileOutputFormat<String, NutchDocument> {
+public class IndexerOutputFormat extends OutputFormat<String, NutchDocument> {
@Override
public RecordWriter<String, NutchDocument> getRecordWriter(
@@ -33,7 +34,7 @@ extends FileOutputFormat<String, NutchDo
NutchIndexWriterFactory.getNutchIndexWriters(job.getConfiguration());
for (final NutchIndexWriter writer : writers) {
- writer.open(job, FileOutputFormat.getUniqueFile(job, "part", ""));
+ writer.open(job);
}
return new RecordWriter<String, NutchDocument>() {
@@ -54,4 +55,33 @@ extends FileOutputFormat<String, NutchDo
}
};
}
+
+ @Override
+ public void checkOutputSpecs(JobContext jobContext) throws IOException,
+ InterruptedException {
+ }
+
+ @Override
+ public OutputCommitter getOutputCommitter(TaskAttemptContext arg0)
+ throws IOException, InterruptedException {
+ //return an empty outputcommitter
+ return new OutputCommitter() {
+ @Override
+ public void setupTask(TaskAttemptContext arg0) throws IOException {
+ }
+ @Override
+ public void setupJob(JobContext arg0) throws IOException {
+ }
+ @Override
+ public boolean needsTaskCommit(TaskAttemptContext arg0) throws IOException {
+ return false;
+ }
+ @Override
+ public void commitTask(TaskAttemptContext arg0) throws IOException {
+ }
+ @Override
+ public void abortTask(TaskAttemptContext arg0) throws IOException {
+ }
+ };
+ }
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchIndexWriter.java Wed Aug 1 14:21:05 2012
@@ -22,7 +22,7 @@ import org.apache.hadoop.mapreduce.TaskA
import org.apache.nutch.indexer.NutchDocument;
public interface NutchIndexWriter {
- public void open(TaskAttemptContext job, String name) throws IOException;
+ public void open(TaskAttemptContext job) throws IOException;
public void write(NutchDocument doc) throws IOException;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrIndexerJob.java Wed Aug 1 14:21:05 2012
@@ -16,28 +16,19 @@
*/
package org.apache.nutch.indexer.solr;
-import java.io.IOException;
-import java.util.Collections;
-import java.util.HashMap;
import java.util.Map;
-import java.util.Random;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.IndexerJob;
import org.apache.nutch.indexer.NutchIndexWriterFactory;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.ToolUtil;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class SolrIndexerJob extends IndexerJob {
@@ -51,30 +42,22 @@ public class SolrIndexerJob extends Inde
getConf().set(SolrConstants.SERVER_URL, solrUrl);
currentJob = createIndexJob(getConf(), "solr-index", batchId);
- Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-"
- + new Random().nextInt());
- FileOutputFormat.setOutputPath(currentJob, tmp);
currentJob.waitForCompletion(true);
ToolUtil.recordJobStatus(null, currentJob, results);
return results;
}
- private void indexSolr(String solrUrl, String batchId) throws Exception {
+ public void indexSolr(String solrUrl, String batchId) throws Exception {
LOG.info("SolrIndexerJob: starting");
- try {
- run(ToolUtil.toArgMap(
- Nutch.ARG_SOLR, solrUrl,
- Nutch.ARG_BATCH, batchId));
- // do the commits once and for all the reducers in one go
- SolrServer solr = new CommonsHttpSolrServer(solrUrl);
- if (getConf().getBoolean(SolrConstants.COMMIT_INDEX, true)) {
- solr.commit();
- }
- } finally {
- FileSystem.get(getConf()).delete(
- FileOutputFormat.getOutputPath(currentJob), true);
+ run(ToolUtil.toArgMap(
+ Nutch.ARG_SOLR, solrUrl,
+ Nutch.ARG_BATCH, batchId));
+ // do the commits once and for all the reducers in one go
+ SolrServer solr = new CommonsHttpSolrServer(solrUrl);
+ if (getConf().getBoolean(SolrConstants.COMMIT_INDEX, true)) {
+ solr.commit();
}
LOG.info("SolrIndexerJob: done.");
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=1368012&r1=1368011&r2=1368012&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Wed Aug 1 14:21:05 2012
@@ -34,7 +34,7 @@ import org.apache.solr.common.SolrInputD
public class SolrWriter implements NutchIndexWriter {
- public static Logger LOG = LoggerFactory.getLogger(SolrWriter.class);
+ public static final Logger LOG = LoggerFactory.getLogger(SolrWriter.class);
private SolrServer solr;
private SolrMappingReader solrMapping;
@@ -45,7 +45,7 @@ public class SolrWriter implements Nutch
private int commitSize;
@Override
- public void open(TaskAttemptContext job, String name)
+ public void open(TaskAttemptContext job)
throws IOException {
Configuration conf = job.getConfiguration();
solr = new CommonsHttpSolrServer(conf.get(SolrConstants.SERVER_URL));