You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2014/09/23 15:45:49 UTC
svn commit: r1627028 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/protocol/Protocol.java src/java/org/apache/nutch/scoring/webgraph/Loops.java src/java/org/apache/nutch/scoring/webgraph/WebGraph.java

Author: lewismc
Date: Tue Sep 23 13:45:48 2014
New Revision: 1627028

URL: http://svn.apache.org/r1627028
Log:
NUTCH-1839 Improve WebGraph CLI parsing

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1627028&r1=1627027&r2=1627028&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Sep 23 13:45:48 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1839 Improve WebGraph CLI parsing (lewismc)
+
 * NUTCH-1526 Create SegmentContentDumperTool for easily extracting out file contents from SegmentDirs (mattmann, lewismc, Julien Le Dem)
 
 * NUTCH-1840 the describe function in SolrIndexWriter is not correct (kaveh minooie via jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=1627028&r1=1627027&r2=1627028&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Tue Sep 23 13:45:48 2014
@@ -28,7 +28,7 @@ import org.apache.nutch.plugin.Pluggable
 import crawlercommons.robots.BaseRobotRules;
 
 
-/** A retriever of url content.  Implemented by protocol extensions. */
+/** A retriever of url content. Implemented by protocol extensions. */
 public interface Protocol extends Pluggable, Configurable {
   /** The name of the extension point. */
   public final static String X_POINT_ID = Protocol.class.getName();

Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java?rev=1627028&r1=1627027&r2=1627028&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Tue Sep 23 13:45:48 2014
@@ -72,7 +72,7 @@ import org.apache.nutch.util.TimingUtil;
  * This job will identify both reciprocal links and cycles of 2+ links up to a
  * set depth to check. The Loops job is expensive in both computational and
  * space terms. Because it checks outlinks of outlinks of outlinks for cycles
- * its intermediate output can be extremly large even if the end output is
+ * its intermediate output can be extremely large even if the end output is
  * rather small. Because of this the Loops job is optional and if it doesn't
  * exist then it won't be factored into the LinkRank program.
  */

Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java?rev=1627028&r1=1627027&r2=1627028&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java Tue Sep 23 13:45:48 2014
@@ -89,13 +89,11 @@ import org.apache.nutch.util.URLUtil;
  * when the WebGraph is updated. The Node database is created from both the
  * Inlink and Outlink databases. Because the Node database is overwritten when
  * the WebGraph is updated and because the Node database holds current scores
- * for urls it is recommended that a crawl-cyle (one or more full crawls) fully
+ * for urls it is recommended that a crawl-cycle (one or more full crawls) fully
  * complete before the WebGraph is updated and some type of analysis, such as
  * LinkRank, is run to update scores in the Node database in a stable fashion.
  */
-public class WebGraph
-  extends Configured
-  implements Tool {
+public class WebGraph extends Configured implements Tool {
 
   public static final Logger LOG = LoggerFactory.getLogger(WebGraph.class);
   public static final String LOCK_NAME = ".locked";
@@ -109,10 +107,8 @@ public class WebGraph
    * by domain and host can be ignored. The number of Outlinks out to a given
    * page or domain can also be limited.
    */
-  public static class OutlinkDb
-    extends Configured
-    implements Mapper<Text, Writable, Text, NutchWritable>,
-    Reducer<Text, NutchWritable, Text, LinkDatum> {
+  public static class OutlinkDb extends Configured implements 
+      Mapper<Text, Writable, Text, NutchWritable>, Reducer<Text, NutchWritable, Text, LinkDatum> {
 
     public static final String URL_NORMALIZING = "webgraph.url.normalizers";
     public static final String URL_FILTERING = "webgraph.url.filters";
@@ -153,7 +149,7 @@ public class WebGraph
 
           // normalize and trim the url
           normalized = urlNormalizers.normalize(url,
-            URLNormalizers.SCOPE_DEFAULT);
+              URLNormalizers.SCOPE_DEFAULT);
           normalized = normalized.trim();
         }
         catch (Exception e) {
@@ -200,7 +196,6 @@ public class WebGraph
       long fetchTime = System.currentTimeMillis();
       String fetchTimeStr = data.getContentMeta().get(Nutch.FETCH_TIME_KEY);
       try {
-
         // get the fetch time from the parse data
         fetchTime = Long.parseLong(fetchTimeStr);
       }
@@ -249,9 +244,8 @@ public class WebGraph
      * Passes through existing LinkDatum objects from an existing OutlinkDb and
      * maps out new LinkDatum objects from new crawls ParseData.
      */
-    public void map(Text key, Writable value,
-      OutputCollector<Text, NutchWritable> output, Reporter reporter)
-      throws IOException {
+    public void map(Text key, Writable value, OutputCollector<Text, NutchWritable> 
+        output, Reporter reporter) throws IOException {
 
       // normalize url, stop processing if null
       String url = normalizeUrl(key.toString());
@@ -274,8 +268,8 @@ public class WebGraph
             datum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
             datum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
 
-            // Tell the reducer to get rid of all instances of this key
-            output.collect(key, new NutchWritable(new BooleanWritable(true)));
+          // Tell the reducer to get rid of all instances of this key
+          output.collect(key, new NutchWritable(new BooleanWritable(true)));
         }
       }
       else if (value instanceof ParseData) {
@@ -301,7 +295,7 @@ public class WebGraph
             // url is existing
             boolean existingUrl = outlinkMap.containsKey(toUrl);
             if (toUrl != null
-              && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
+                && (!existingUrl || (existingUrl && outlinkMap.get(toUrl) == null))) {
               outlinkMap.put(toUrl, outlink.getAnchor());
             }
           }
@@ -328,8 +322,8 @@ public class WebGraph
     }
 
     public void reduce(Text key, Iterator<NutchWritable> values,
-      OutputCollector<Text, LinkDatum> output, Reporter reporter)
-      throws IOException {
+        OutputCollector<Text, LinkDatum> output, Reporter reporter)
+            throws IOException {
 
       // aggregate all outlinks, get the most recent timestamp for a fetch
       // which should be the timestamp for all of the most recent outlinks
@@ -381,10 +375,10 @@ public class WebGraph
         // outlinks must be the most recent and conform to internal url and
         // limiting rules, if it does collect it
         if (datum.getTimestamp() == mostRecent
-          && (!limitPages || (limitPages && !pages.contains(toPage)))
-          && (!limitDomains || (limitDomains && !domains.contains(toDomain)))
-          && (!ignoreHost || (ignoreHost && !toHost.equalsIgnoreCase(host)))
-          && (!ignoreDomain || (ignoreDomain && !toDomain.equalsIgnoreCase(domain)))) {
+            && (!limitPages || (limitPages && !pages.contains(toPage)))
+            && (!limitDomains || (limitDomains && !domains.contains(toDomain)))
+            && (!ignoreHost || (ignoreHost && !toHost.equalsIgnoreCase(host)))
+            && (!ignoreDomain || (ignoreDomain && !toDomain.equalsIgnoreCase(domain)))) {
           output.collect(key, datum);
           pages.add(toPage);
           domains.add(toDomain);
@@ -401,9 +395,7 @@ public class WebGraph
    * OutlinkDb LinkDatum objects and are regenerated each time the WebGraph is
    * updated.
    */
-  private static class InlinkDb
-    extends Configured
-    implements Mapper<Text, LinkDatum, Text, LinkDatum> {
+  private static class InlinkDb extends Configured implements Mapper<Text, LinkDatum, Text, LinkDatum> {
 
     private long timestamp;
 
@@ -422,9 +414,8 @@ public class WebGraph
      * Inverts the Outlink LinkDatum objects into new LinkDatum objects with a
      * new system timestamp, type and to and from url switched.
      */
-    public void map(Text key, LinkDatum datum,
-      OutputCollector<Text, LinkDatum> output, Reporter reporter)
-      throws IOException {
+    public void map(Text key, LinkDatum datum, OutputCollector<Text, LinkDatum> 
+        output, Reporter reporter) throws IOException {
 
       // get the to and from url and the anchor
       String fromUrl = key.toString();
@@ -442,9 +433,7 @@ public class WebGraph
    * Creates the Node database which consists of the number of in and outlinks
    * for each url and a score slot for analysis programs such as LinkRank.
    */
-  private static class NodeDb
-    extends Configured
-    implements Reducer<Text, LinkDatum, Text, Node> {
+  private static class NodeDb extends Configured implements Reducer<Text, LinkDatum, Text, Node> {
 
     /**
      * Configures job.
@@ -457,9 +446,8 @@ public class WebGraph
      * Counts the number of inlinks and outlinks for each url and sets a default
      * score of 0.0 for each url (node) in the webgraph.
      */
-    public void reduce(Text key, Iterator<LinkDatum> values,
-      OutputCollector<Text, Node> output, Reporter reporter)
-      throws IOException {
+    public void reduce(Text key, Iterator<LinkDatum> values, OutputCollector<Text, Node> 
+        output, Reporter reporter) throws IOException {
 
       Node node = new Node();
       int numInlinks = 0;
@@ -498,7 +486,7 @@ public class WebGraph
    * @throws IOException If an error occurs while processing the WebGraph.
    */
   public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter)
-    throws IOException {
+      throws IOException {
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
@@ -529,7 +517,7 @@ public class WebGraph
     }
 
     Path tempOutlinkDb = new Path(outlinkDb + "-"
-      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
     JobConf outlinkJob = new NutchJob(conf);
     outlinkJob.setJobName("Outlinkdb: " + outlinkDb);
 
@@ -588,7 +576,7 @@ public class WebGraph
       LOG.info("OutlinkDb: finished");
     }
     catch (IOException e) {
-      
+
       // remove lock file and temporary directory if an error occurs
       LockUtil.removeLockFile(fs, lock);
       if (fs.exists(tempOutlinkDb)) {
@@ -601,7 +589,7 @@ public class WebGraph
     // inlink and temp link database paths
     Path inlinkDb = new Path(webGraphDb, INLINK_DIR);
     Path tempInlinkDb = new Path(inlinkDb + "-"
-      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
     JobConf inlinkJob = new NutchJob(conf);
     inlinkJob.setJobName("Inlinkdb " + inlinkDb);
@@ -618,7 +606,7 @@ public class WebGraph
     inlinkJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
 
     try {
-      
+
       // run the inlink and replace any old with new
       LOG.info("InlinkDb: running");
       JobClient.runJob(inlinkJob);
@@ -627,7 +615,7 @@ public class WebGraph
       LOG.info("InlinkDb: finished");
     }
     catch (IOException e) {
-      
+
       // remove lock file and temporary directory if an error occurs
       LockUtil.removeLockFile(fs, lock);
       if (fs.exists(tempInlinkDb)) {
@@ -640,7 +628,7 @@ public class WebGraph
     // node and temp node database paths
     Path nodeDb = new Path(webGraphDb, NODE_DIR);
     Path tempNodeDb = new Path(nodeDb + "-"
-      + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+        + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
     JobConf nodeJob = new NutchJob(conf);
     nodeJob.setJobName("NodeDb " + nodeDb);
@@ -659,7 +647,7 @@ public class WebGraph
     nodeJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
 
     try {
-      
+
       // run the node job and replace old nodedb with new
       LOG.info("NodeDb: running");
       JobClient.runJob(nodeJob);
@@ -668,7 +656,7 @@ public class WebGraph
       LOG.info("NodeDb: finished");
     }
     catch (IOException e) {
-      
+
       // remove lock file and temporary directory if an error occurs
       LockUtil.removeLockFile(fs, lock);
       if (fs.exists(tempNodeDb)) {
@@ -685,8 +673,7 @@ public class WebGraph
     LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
   }
 
-  public static void main(String[] args)
-    throws Exception {
+  public static void main(String[] args) throws Exception {
     int res = ToolRunner.run(NutchConfiguration.create(), new WebGraph(), args);
     System.exit(res);
   }
@@ -694,51 +681,43 @@ public class WebGraph
   /**
    * Parses command line arguments and runs the WebGraph jobs.
    */
-  public int run(String[] args)
-    throws Exception {
+  public int run(String[] args) throws Exception {
 
-    Options options = new Options();
-    OptionBuilder.withArgName("help");
-    OptionBuilder.withDescription("show this help message");
-    Option helpOpts = OptionBuilder.create("help");
-    options.addOption(helpOpts);
-    
-    OptionBuilder.withArgName("webgraphdb");
-    OptionBuilder.hasArg();
-    OptionBuilder.withDescription("the web graph database to use");
-    Option webGraphDbOpts = OptionBuilder.create("webgraphdb");
-    options.addOption(webGraphDbOpts);
-    
-    OptionBuilder.withArgName("segment");
-    OptionBuilder.hasArgs();
-    OptionBuilder.withDescription("the segment(s) to use");
-    Option segOpts = OptionBuilder.create("segment");
-    options.addOption(segOpts);
+    //boolean options
+    Option helpOpt = new Option("h", "help", false, "show this help message");
+    Option normOpt = new Option("n", "normalize", false, "whether to use URLNormalizers on the URL's in the segment");
+    Option filtOpt = new Option("f", "filter", false, "whether to use URLFilters on the URL's in the segment");
+
+    //argument options
+    @SuppressWarnings("static-access")
+    Option graphOpt = OptionBuilder.withArgName("webgraphdb")
+        .hasArg().withDescription("the web graph database to create (if none exists) or use if one does")
+        .create("webgraphdb");
+    @SuppressWarnings("static-access")
+    Option segOpt = OptionBuilder.withArgName("segment")
+        .hasArgs().withDescription("the segment(s) to use")
+        .create("segment");
+    @SuppressWarnings("static-access")
+    Option segDirOpt = OptionBuilder.withArgName("segmentDir")
+        .hasArgs().withDescription("the segment directory to use")
+        .create("segmentDir");
     
-    OptionBuilder.withArgName("segmentDir");
-    OptionBuilder.hasArgs();
-    OptionBuilder.withDescription("the segment directory to use");
-    Option segDirOpts = OptionBuilder.create("segmentDir");
-    options.addOption(segDirOpts);
-    
-    OptionBuilder.withArgName("normalize");
-    OptionBuilder.withDescription("whether to use URLNormalizers on the URL's in the segment");
-    Option normalizeOpts = OptionBuilder.create("normalize");
-    options.addOption(normalizeOpts);
-    
-    OptionBuilder.withArgName("filter");
-    OptionBuilder.withDescription("whether to use URLFilters on the URL's in the segment");
-    Option filterOpts = OptionBuilder.create("filter");
-    options.addOption(filterOpts);
+    //create the options
+    Options options = new Options();
+    options.addOption(helpOpt);
+    options.addOption(normOpt);
+    options.addOption(filtOpt);
+    options.addOption(graphOpt);
+    options.addOption(segOpt);
+    options.addOption(segDirOpt);
 
     CommandLineParser parser = new GnuParser();
     try {
-
       CommandLine line = parser.parse(options, args);
       if (line.hasOption("help") || !line.hasOption("webgraphdb")
-        || (!line.hasOption("segment") && !line.hasOption("segmentDir"))) {
+          || (!line.hasOption("segment") && !line.hasOption("segmentDir"))) {
         HelpFormatter formatter = new HelpFormatter();
-        formatter.printHelp("WebGraph", options);
+        formatter.printHelp("WebGraph", options, true);
         return -1;
       }