Posted to commits@nutch.apache.org by ma...@apache.org on 2015/10/30 22:51:22 UTC

svn commit: r1711560 - in /nutch/trunk: CHANGES.txt src/bin/nutch src/java/org/apache/nutch/util/CrawlCompletionStats.java

Author: mattmann
Date: Fri Oct 30 21:51:22 2015
New Revision: 1711560

URL: http://svn.apache.org/viewvc?rev=1711560&view=rev
Log:
Fix for NUTCH-2155: Create a crawl completeness utility, contributed by Michael Joyce <ml...@gmail.com>; this closes #83

Added:
    nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/bin/nutch

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1711560&r1=1711559&r2=1711560&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Oct 30 21:51:22 2015
@@ -1,8 +1,10 @@
 Nutch Change Log
     
-Nutch Current Development 1.11 25/10/2015 (dd/mm/yyyy)
+Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 
+* NUTCH-2155 Create a "crawl completeness" utility (Michael Joyce via mattmann)
+
 * NUTCH-1988 Make nested output directory dump optional... again (Michael Joyce via lewismc)
 
 * NUTCH-1800 Documentation for Nutch 1.X and 2.X REST APIs (lewismc)

Modified: nutch/trunk/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1711560&r1=1711559&r2=1711560&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Fri Oct 30 21:51:22 2015
@@ -80,6 +80,7 @@ if [ $# = 0 ]; then
   echo "  parsechecker      check the parser for a given url"
   echo "  indexchecker      check the indexing filters for a given url"
   echo "  domainstats       calculate domain statistics from crawldb"
+  echo "  crawlcomplete     calculate crawl completion stats from crawldb"
   echo "  webgraph          generate a web graph from existing segments"
   echo "  linkrank          run a link analysis program on the generated web graph"
   echo "  scoreupdater      updates the crawldb with linkrank scores"
@@ -260,6 +261,8 @@ elif [ "$COMMAND" = "indexchecker" ] ; t
   CLASS=org.apache.nutch.indexer.IndexingFiltersChecker
 elif [ "$COMMAND" = "domainstats" ] ; then 
   CLASS=org.apache.nutch.util.domain.DomainStatistics
+elif [ "$COMMAND" = "crawlcomplete" ] ; then
+  CLASS=org.apache.nutch.util.CrawlCompletionStats
 elif [ "$COMMAND" = "webgraph" ] ; then
   CLASS=org.apache.nutch.scoring.webgraph.WebGraph
 elif [ "$COMMAND" = "linkrank" ] ; then

Added: nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java?rev=1711560&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/CrawlCompletionStats.java Fri Oct 30 21:51:22 2015
@@ -0,0 +1,189 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+
+/**
+ * Extracts simple crawl completion stats from the crawldb.
+ *
+ * Stats are aggregated per host or domain and take the form:
+ * 1	www.spitzer.caltech.edu FETCHED
+ * 50	www.spitzer.caltech.edu UNFETCHED
+ *
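+ * Usage: bin/nutch crawlcomplete <inputDirs> <outDir> <host|domain> [numOfReducers]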
+ */
+public class CrawlCompletionStats extends Configured implements Tool {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(CrawlCompletionStats.class);
+
+  private static final int MODE_HOST = 1;
+  private static final int MODE_DOMAIN = 2;
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 3) {
+      System.out.println(
+          "usage: CrawlCompletionStats inputDirs outDir host|domain [numOfReducers]");
+      return 1;
+    }
+    String inputDir = args[0];
+    String outputDir = args[1];
+    int numOfReducers = 1;
+
+    if (args.length > 3) {
+      numOfReducers = Integer.parseInt(args[3]);
+    }
+
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start));
+
+    int mode = 0;
+    String jobName = "CrawlCompletionStats";
+    if (args[2].equals("host")) {
+      jobName = "Host CrawlCompletionStats";
+      mode = MODE_HOST;
+    } else if (args[2].equals("domain")) {
+      jobName = "Domain CrawlCompletionStats";
+      mode = MODE_DOMAIN;
+    } else {
+      System.out.println("Third argument must be either 'host' or 'domain'");
+      return 1;
+    }
+
+    Configuration conf = getConf();
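+    // Reuse the DomainStatistics config key; the mapper reads the mode from it.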
+    conf.setInt("domain.statistics.mode", mode);
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+
+    Job job = Job.getInstance(conf, jobName);
+    job.setJarByClass(CrawlCompletionStats.class);
+
+    for (String inputDirSpec : inputDir.split(",")) {
+      FileInputFormat.addInputPath(job, new Path(inputDirSpec));
+    }
+
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    FileOutputFormat.setOutputPath(job, new Path(outputDir));
+    job.setOutputFormatClass(TextOutputFormat.class);
+
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(LongWritable.class);
+    // The reducer emits (count, "host STATUS"), so the job output types must match.
+    job.setOutputKeyClass(LongWritable.class);
+    job.setOutputValueClass(Text.class);
+
+    job.setMapperClass(CrawlCompletionStatsMapper.class);
+    job.setReducerClass(CrawlCompletionStatsReducer.class);
+    job.setCombinerClass(CrawlCompletionStatsCombiner.class);
+    job.setNumReduceTasks(numOfReducers);
+
+    boolean success = job.waitForCompletion(true);
+    if (!success) {
+      LOG.error("CrawlCompletionStats job did not succeed");
+      return 1;
+    }
+
+    long end = System.currentTimeMillis();
+    LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}",
+      sdf.format(end), TimingUtil.elapsedTime(start, end));
+    return 0;
+  }
+
+  static class CrawlCompletionStatsMapper extends
+      Mapper<Text, CrawlDatum, Text, LongWritable> {
+    int mode = 0;
+
+    public void setup(Context context) {
+      mode = context.getConfiguration().getInt("domain.statistics.mode", MODE_DOMAIN);
+    }
+
+    public void map(Text urlText, CrawlDatum datum, Context context)
+        throws IOException, InterruptedException {
+
+      URL url = new URL(urlText.toString());
+      String out = "";
+      switch (mode) {
+        case MODE_HOST:
+          out = url.getHost();
+          break;
+        case MODE_DOMAIN:
+          out = URLUtil.getDomainName(url);
+          break;
+      }
+
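+      // db_notmodified pages were fetched at least once, so count them as FETCHED.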
+      if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
+          || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+        context.write(new Text(out + " FETCHED"), new LongWritable(1));
+      } else {
+        context.write(new Text(out + " UNFETCHED"), new LongWritable(1));
+      }
+    }
+  }
+
+  static class CrawlCompletionStatsReducer extends
+      Reducer<Text, LongWritable, LongWritable, Text> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+
+      context.write(new LongWritable(total), key);
+    }
+  }
+
+  public static class CrawlCompletionStatsCombiner extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+      long total = 0;
+
+      for (LongWritable val : values) {
+        total += val.get();
+      }
+      context.write(key, new LongWritable(total));
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlCompletionStats(), args);
+    System.exit(res);
+  }
+}
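
For orientation: each reducer writes one part file pairing a count with
"host STATUS" (or "domain STATUS"). Reusing the sample hosts from the class
Javadoc, a part-r-00000 might read as follows; the counts are illustrative only.

  1     www.spitzer.caltech.edu FETCHED
  50    www.spitzer.caltech.edu UNFETCHED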