You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jo...@apache.org on 2016/02/18 19:00:12 UTC

svn commit: r1731102 - /nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java

Author: joyce
Date: Thu Feb 18 18:00:12 2016
New Revision: 1731102

URL: http://svn.apache.org/viewvc?rev=1731102&view=rev
Log:
NUTCH-2218 - Update CrawlComplete util to use Commons CLI

Modified:
    nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java

Modified: nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java?rev=1731102&r1=1731101&r2=1731102&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java Thu Feb 18 18:00:12 2016
@@ -42,6 +42,15 @@ import org.apache.nutch.util.NutchConfig
 import org.apache.nutch.util.TimingUtil;
 import org.apache.nutch.util.URLUtil;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.MissingOptionException;
+
 /**
  * Extracts some simple crawl completion stats from the crawldb
  *
@@ -61,27 +70,60 @@ public class CrawlCompletionStats extend
   private int mode = 0;
 
   public int run(String[] args) throws Exception {
-    if (args.length < 3) {
-      System.err.println("Usage: CrawlCompletionStats inputDirs outDir mode [numOfReducer]");
-
-      System.err.println("\tinputDirs\tComma separated list of crawldb input directories");
-      System.err.println("\t\t\tE.g.: crawl/crawldb/");
-
-      System.err.println("\toutDir\t\tOutput directory where results should be dumped");
-
-      System.err.println("\tmode\t\tSet statistics gathering mode");
-      System.err.println("\t\t\t\thost\tGather statistics by host");
-      System.err.println("\t\t\t\tdomain\tGather statistics by domain");
+    Option helpOpt = new Option("h", "help", false, "Show this message");
+    Option inDirs = OptionBuilder
+        .withArgName("inputDirs")
+        .isRequired()
+        .withDescription("Comma separated list of crawl directories (e.g., \"./crawl1,./crawl2\")")
+        .hasArgs()
+        .create("inputDirs");
+    Option outDir = OptionBuilder
+        .withArgName("outputDir")
+        .isRequired()
+        .withDescription("Output directory where results should be dumped")
+        .hasArgs()
+        .create("outputDir");
+    Option modeOpt = OptionBuilder
+        .withArgName("mode")
+        .isRequired()
+        .withDescription("Set statistics gathering mode (by 'host' or by 'domain')")
+        .hasArgs()
+        .create("mode");
+    Option numReducers = OptionBuilder
+        .withArgName("numReducers")
+        .withDescription("Optional number of reduce jobs to use. Defaults to 1")
+        .hasArgs()
+        .create("numReducers");
+
+    Options options = new Options();
+    options.addOption(helpOpt);
+    options.addOption(inDirs);
+    options.addOption(outDir);
+    options.addOption(modeOpt);
+    options.addOption(numReducers);
+
+    CommandLineParser parser = new GnuParser();
+    CommandLine cli;
+
+    try {
+      cli = parser.parse(options, args);
+    } catch (MissingOptionException e) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("CrawlCompletionStats", options, true);
+      return 1;
+    }
 
-      System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1.");
+    if (cli.hasOption("help")) {
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.printHelp("CrawlCompletionStats", options, true);
       return 1;
     }
 
-    String inputDir = args[0];
-    String outputDir = args[1];
-    int numOfReducers = 1;
+    String inputDir = cli.getOptionValue("inputDirs");
+    String outputDir = cli.getOptionValue("outputDir");
 
-    if (args.length > 3) {
+    int numOfReducers = 1;
+    if (cli.hasOption("numReducers")) {
       numOfReducers = Integer.parseInt(args[3]);
     }
 
@@ -91,13 +133,13 @@ public class CrawlCompletionStats extend
 
     int mode = 0;
     String jobName = "CrawlCompletionStats";
-    if (args[2].equals("host")) {
+    if (cli.getOptionValue("mode").equals("host")) {
       jobName = "Host CrawlCompletionStats";
       mode = MODE_HOST;
-    } else if (args[2].equals("domain")) {
+    } else if (cli.getOptionValue("mode").equals("domain")) {
       jobName = "Domain CrawlCompletionStats";
       mode = MODE_DOMAIN;
-    }
+    } 
 
     Configuration conf = getConf();
     conf.setInt("domain.statistics.mode", mode);
@@ -108,7 +150,7 @@ public class CrawlCompletionStats extend
 
     String[] inputDirsSpecs = inputDir.split(",");
     for (int i = 0; i < inputDirsSpecs.length; i++) {
-      File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
+      File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current");
       FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
       
     }