You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jo...@apache.org on 2016/02/18 19:00:12 UTC
svn commit: r1731102 -
/nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java
Author: joyce
Date: Thu Feb 18 18:00:12 2016
New Revision: 1731102
URL: http://svn.apache.org/viewvc?rev=1731102&view=rev
Log:
NUTCH-2218 - Update CrawlComplete util to use Commons CLI
Modified:
nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java
Modified: nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java?rev=1731102&r1=1731101&r2=1731102&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java Thu Feb 18 18:00:12 2016
@@ -42,6 +42,15 @@ import org.apache.nutch.util.NutchConfig
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.MissingOptionException;
+
/**
* Extracts some simple crawl completion stats from the crawldb
*
@@ -61,27 +70,60 @@ public class CrawlCompletionStats extend
private int mode = 0;
public int run(String[] args) throws Exception {
- if (args.length < 3) {
- System.err.println("Usage: CrawlCompletionStats inputDirs outDir mode [numOfReducer]");
-
- System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
- System.err.println("\t\t\tE.g.: crawl/crawldb/");
-
- System.err.println("\toutDir\t\tOutput directory where results should be dumped");
-
- System.err.println("\tmode\t\tSet statistics gathering mode");
- System.err.println("\t\t\t\thost\tGather statistics by host");
- System.err.println("\t\t\t\tdomain\tGather statistics by domain");
+ Option helpOpt = new Option("h", "help", false, "Show this message");
+ Option inDirs = OptionBuilder
+ .withArgName("inputDirs")
+ .isRequired()
+ .withDescription("Comma separated list of crawl directories (e.g., \"./crawl1,./crawl2\")")
+ .hasArgs()
+ .create("inputDirs");
+ Option outDir = OptionBuilder
+ .withArgName("outputDir")
+ .isRequired()
+ .withDescription("Output directory where results should be dumped")
+ .hasArgs()
+ .create("outputDir");
+ Option modeOpt = OptionBuilder
+ .withArgName("mode")
+ .isRequired()
+ .withDescription("Set statistics gathering mode (by 'host' or by 'domain')")
+ .hasArgs()
+ .create("mode");
+ Option numReducers = OptionBuilder
+ .withArgName("numReducers")
+ .withDescription("Optional number of reduce jobs to use. Defaults to 1")
+ .hasArgs()
+ .create("numReducers");
+
+ Options options = new Options();
+ options.addOption(helpOpt);
+ options.addOption(inDirs);
+ options.addOption(outDir);
+ options.addOption(modeOpt);
+ options.addOption(numReducers);
+
+ CommandLineParser parser = new GnuParser();
+ CommandLine cli;
+
+ try {
+ cli = parser.parse(options, args);
+ } catch (MissingOptionException e) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("CrawlCompletionStats", options, true);
+ return 1;
+ }
- System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+ if (cli.hasOption("help")) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("CrawlCompletionStats", options, true);
return 1;
}
- String inputDir = args[0];
- String outputDir = args[1];
- int numOfReducers = 1;
+ String inputDir = cli.getOptionValue("inputDirs");
+ String outputDir = cli.getOptionValue("outputDir");
- if (args.length > 3) {
+ int numOfReducers = 1;
+ if (cli.hasOption("numReducers")) {
numOfReducers = Integer.parseInt(args[3]);
}
@@ -91,13 +133,13 @@ public class CrawlCompletionStats extend
int mode = 0;
String jobName = "CrawlCompletionStats";
- if (args[2].equals("host")) {
+ if (cli.getOptionValue("mode").equals("host")) {
jobName = "Host CrawlCompletionStats";
mode = MODE_HOST;
- } else if (args[2].equals("domain")) {
+ } else if (cli.getOptionValue("mode").equals("domain")) {
jobName = "Domain CrawlCompletionStats";
mode = MODE_DOMAIN;
- }
+ }
Configuration conf = getConf();
conf.setInt("domain.statistics.mode", mode);
@@ -108,7 +150,7 @@ public class CrawlCompletionStats extend
String[] inputDirsSpecs = inputDir.split(",");
for (int i = 0; i < inputDirsSpecs.length; i++) {
- File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
+ File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current");
FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
}