You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/03/31 19:34:38 UTC
svn commit: r1670404 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/CrawlDbReader.java
src/java/org/apache/nutch/tools/Benchmark.java
Author: markus
Date: Tue Mar 31 17:34:37 2015
New Revision: 1670404
URL: http://svn.apache.org/r1670404
Log:
NUTCH-1979 CrawlDbReader to implement Tool
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1670404&r1=1670403&r2=1670404&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Mar 31 17:34:37 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1979 CrawlDbReader to implement Tool (markus)
+
* NUTCH-1970 Pretty print JSON output in config resource (Tyler Pasulich, mattmann)
* NUTCH-1976 Allow Users to Set Hostname for Server (Tyler Palsulich via mattmann)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1670404&r1=1670403&r2=1670404&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Tue Mar 31 17:34:37 2015
@@ -33,7 +33,7 @@ import java.util.TreeMap;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
@@ -59,6 +59,8 @@ import org.apache.hadoop.mapred.lib.Hash
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
@@ -69,13 +71,13 @@ import org.apache.nutch.util.StringUtil;
* @author Andrzej Bialecki
*
*/
-public class CrawlDbReader implements Closeable {
+public class CrawlDbReader extends Configured implements Closeable, Tool {
public static final Logger LOG = LoggerFactory.getLogger(CrawlDbReader.class);
private MapFile.Reader[] readers = null;
- private void openReaders(String crawlDb, Configuration config)
+ private void openReaders(String crawlDb, JobConf config)
throws IOException {
if (readers != null)
return;
@@ -343,7 +345,7 @@ public class CrawlDbReader implements Cl
closeReaders();
}
- public void processStatJob(String crawlDb, Configuration config, boolean sort)
+ public void processStatJob(String crawlDb, JobConf config, boolean sort)
throws IOException {
if (LOG.isInfoEnabled()) {
@@ -443,7 +445,7 @@ public class CrawlDbReader implements Cl
}
- public CrawlDatum get(String crawlDb, String url, Configuration config)
+ public CrawlDatum get(String crawlDb, String url, JobConf config)
throws IOException {
Text key = new Text(url);
CrawlDatum val = new CrawlDatum();
@@ -453,7 +455,7 @@ public class CrawlDbReader implements Cl
return res;
}
- public void readUrl(String crawlDb, String url, Configuration config)
+ public void readUrl(String crawlDb, String url, JobConf config)
throws IOException {
CrawlDatum res = get(crawlDb, url, config);
System.out.println("URL: " + url);
@@ -465,7 +467,7 @@ public class CrawlDbReader implements Cl
}
public void processDumpJob(String crawlDb, String output,
- Configuration config, String format, String regex, String status,
+ JobConf config, String format, String regex, String status,
Integer retry) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb dump: starting");
@@ -554,7 +556,7 @@ public class CrawlDbReader implements Cl
}
public void processTopNJob(String crawlDb, long topN, float min,
- String output, Configuration config) throws IOException {
+ String output, JobConf config) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
@@ -609,7 +611,7 @@ public class CrawlDbReader implements Cl
}
- public static void main(String[] args) throws IOException {
+ public int run(String[] args) throws IOException {
@SuppressWarnings("resource")
CrawlDbReader dbr = new CrawlDbReader();
@@ -638,11 +640,11 @@ public class CrawlDbReader implements Cl
System.err
.println("\t\t[<min>]\tskip records with scores below this value.");
System.err.println("\t\t\tThis can significantly improve performance.");
- return;
+ return -1;
}
String param = null;
String crawlDb = args[0];
- Configuration conf = NutchConfiguration.create();
+ JobConf job = new NutchJob(getConf());
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-stats")) {
boolean toSort = false;
@@ -650,7 +652,7 @@ public class CrawlDbReader implements Cl
toSort = true;
i++;
}
- dbr.processStatJob(crawlDb, conf, toSort);
+ dbr.processStatJob(crawlDb, job, toSort);
} else if (args[i].equals("-dump")) {
param = args[++i];
String format = "normal";
@@ -675,10 +677,10 @@ public class CrawlDbReader implements Cl
i = i + 2;
}
}
- dbr.processDumpJob(crawlDb, param, conf, format, regex, status, retry);
+ dbr.processDumpJob(crawlDb, param, job, format, regex, status, retry);
} else if (args[i].equals("-url")) {
param = args[++i];
- dbr.readUrl(crawlDb, param, conf);
+ dbr.readUrl(crawlDb, param, job);
} else if (args[i].equals("-topN")) {
param = args[++i];
long topN = Long.parseLong(param);
@@ -687,11 +689,18 @@ public class CrawlDbReader implements Cl
if (i < args.length - 1) {
min = Float.parseFloat(args[++i]);
}
- dbr.processTopNJob(crawlDb, topN, min, param, conf);
+ dbr.processTopNJob(crawlDb, topN, min, param, job);
} else {
System.err.println("\nError: wrong argument " + args[i]);
+ return -1;
}
}
- return;
+ return 0;
}
-}
+
+ public static void main(String[] args) throws Exception {
+ int result = ToolRunner.run(NutchConfiguration.create(),
+ new CrawlDbReader(), args);
+ System.exit(result);
+ }
+}
\ No newline at end of file
Modified: nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java?rev=1670404&r1=1670403&r2=1670404&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java Tue Mar 31 17:34:37 2015
@@ -277,7 +277,7 @@ public class Benchmark extends Configure
}
res.elapsed = System.currentTimeMillis() - res.elapsed;
CrawlDbReader dbreader = new CrawlDbReader();
- dbreader.processStatJob(crawlDb.toString(), conf, false);
+ dbreader.processStatJob(crawlDb.toString(), job, false);
return res;
}