You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2008/12/02 15:50:37 UTC

svn commit: r722478 - /lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java

Author: kubes
Date: Tue Dec  2 06:50:37 2008
New Revision: 722478

URL: http://svn.apache.org/viewvc?rev=722478&view=rev
Log:
NUTCH-647: Resolve URLs tool

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java

Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java?rev=722478&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java Tue Dec  2 06:50:37 2008
@@ -0,0 +1,229 @@
+package org.apache.nutch.tools;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.net.InetAddress;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * A simple tool that will spin up multiple threads to resolve urls to ip
+ * addresses. This can be used to verify that pages that are failing due to
+ * UnknownHostException during fetching are actually bad and are not failing due
+ * to a dns problem in fetching.  The counters are static, so the logged totals
+ * accumulate over every run made inside the same JVM.
+ */
+public class ResolveUrls {
+
+  public static final Log LOG = LogFactory.getLog(ResolveUrls.class);
+
+  private String urlsFile = null;
+  private int numThreads = 100;
+  private ExecutorService pool = null;
+  private static AtomicInteger numTotal = new AtomicInteger(0);
+  private static AtomicInteger numErrored = new AtomicInteger(0);
+  private static AtomicInteger numResolved = new AtomicInteger(0);
+  private static AtomicLong totalTime = new AtomicLong(0L);
+
+  /**
+   * A task which gets the ip address of a single host by name.  It is only
+   * ever handed to the thread pool, so it is a Runnable and not a Thread.
+   */
+  private static class ResolverThread
+    implements Runnable {
+
+    private final String url;
+
+    public ResolverThread(String url) {
+      this.url = url;
+    }
+
+    public void run() {
+
+      numTotal.incrementAndGet();
+      String host = URLUtil.getHost(url);
+      boolean hasHost = (host != null && host.length() > 0);
+      boolean resolved = false;
+      long start = System.currentTimeMillis();
+      try {
+
+        // InetAddress.getByName answers with the loopback address for a null
+        // or empty host, so only look up a real host name; without one the
+        // url is counted as errored instead of resolved
+        if (hasHost) {
+          InetAddress.getByName(host);
+          resolved = true;
+        }
+      }
+      catch (Exception e) {
+        // any failure to look the host up leaves resolved as false
+      }
+      long total = System.currentTimeMillis() - start;
+      totalTime.addAndGet(total);
+
+      // log outcome and timing in one message so that output from concurrent
+      // tasks cannot end up between them
+      String name = hasHost ? host : url;
+      if (resolved) {
+        numResolved.incrementAndGet();
+        LOG.info("Resolved: " + name + ", " + total + " millis");
+      }
+      else {
+        numErrored.incrementAndGet();
+        LOG.info("Error Resolving: " + name + ", " + total + " millis");
+      }
+    }
+  }
+
+  /**
+   * Creates a thread pool for resolving urls.  Reads in the url file on the
+   * local filesystem, skipping blank lines.  For each url it attempts to
+   * resolve it keeping a total account of the number resolved, errored, and
+   * the amount of time.  Waits up to 60 seconds for the lookups to finish and
+   * then logs the totals.
+   */
+  public void resolveUrls() {
+
+    BufferedReader buffRead = null;
+    try {
+
+      // create a thread pool with a fixed number of threads
+      pool = Executors.newFixedThreadPool(numThreads);
+
+      // read in the urls file and loop through each line, one url per line
+      buffRead = new BufferedReader(new FileReader(new File(urlsFile)));
+      String urlStr = null;
+      while ((urlStr = buffRead.readLine()) != null) {
+        urlStr = urlStr.trim();
+        if (urlStr.length() > 0) {
+
+          // queue a resolver task per url
+          LOG.info("Starting: " + urlStr);
+          pool.execute(new ResolverThread(urlStr));
+        }
+      }
+
+      // stop accepting tasks before waiting, awaitTermination can only return
+      // early once a shutdown was requested, then wait for up to 60 seconds
+      pool.shutdown();
+      if (!pool.awaitTermination(60, TimeUnit.SECONDS)) {
+        LOG.warn("Not all urls finished resolving, totals are partial");
+      }
+    }
+    catch (Exception e) {
+
+      // on error shutdown the thread pool immediately, the pool is still
+      // null when creating it was what failed
+      if (e instanceof InterruptedException) {
+        Thread.currentThread().interrupt();
+      }
+      if (pool != null) {
+        pool.shutdownNow();
+      }
+      LOG.error(StringUtils.stringifyException(e));
+    }
+    finally {
+
+      // close the urls file even when reading it failed
+      if (buffRead != null) {
+        try {
+          buffRead.close();
+        }
+        catch (Exception e) {
+          LOG.warn("Error closing: " + urlsFile);
+        }
+      }
+    }
+
+    // log totals, an empty run has no average to divide out
+    int total = numTotal.get();
+    long average = (total > 0) ? totalTime.get() / total : 0L;
+    LOG.info("Total: " + total + ", Resolved: " + numResolved.get()
+      + ", Errored: " + numErrored.get() + ", Average Time: " + average);
+  }
+
+  /**
+   * Create a new ResolveUrls with a file from the local file system.
+   *
+   * @param urlsFile The local urls file, one url per line.
+   */
+  public ResolveUrls(String urlsFile) {
+    this(urlsFile, 100);
+  }
+
+  /**
+   * Create a new ResolveUrls with a urls file and a number of threads for the
+   * Thread pool.  Number of threads is 100 by default.
+   *
+   * @param urlsFile The local urls file, one url per line.
+   * @param numThreads The number of threads used to resolve urls in parallel.
+   */
+  public ResolveUrls(String urlsFile, int numThreads) {
+    this.urlsFile = urlsFile;
+    this.numThreads = numThreads;
+  }
+
+  /**
+   * Runs the resolve urls tool.
+   *
+   * @param args The command line, -urls is required, -numThreads and -help
+   * are optional.
+   */
+  public static void main(String[] args) {
+
+    Options options = new Options();
+    Option helpOpts = OptionBuilder.withArgName("help").withDescription(
+      "show this help message").create("help");
+    Option urlOpts = OptionBuilder.withArgName("urls").hasArg().withDescription(
+      "the urls file to check").create("urls");
+
+    // hasArg and not hasArgs, the option takes a single value
+    Option numThreadOpts = OptionBuilder.withArgName("numThreads").hasArg().withDescription(
+      "the number of threads to use").create("numThreads");
+    options.addOption(helpOpts);
+    options.addOption(urlOpts);
+    options.addOption(numThreadOpts);
+
+    CommandLineParser parser = new GnuParser();
+    try {
+
+      // parse out command line arguments
+      CommandLine line = parser.parse(options, args);
+      if (line.hasOption("help") || !line.hasOption("urls")) {
+        HelpFormatter formatter = new HelpFormatter();
+        formatter.printHelp("ResolveUrls", options);
+        return;
+      }
+
+      // get the urls and the number of threads and start the resolver
+      String urls = line.getOptionValue("urls");
+      int numThreads = 100;
+      String numThreadsStr = line.getOptionValue("numThreads");
+      if (numThreadsStr != null) {
+        numThreads = Integer.parseInt(numThreadsStr);
+      }
+      ResolveUrls resolve = new ResolveUrls(urls, numThreads);
+      resolve.resolveUrls();
+    }
+    catch (Exception e) {
+      LOG.fatal("ResolveUrls: " + StringUtils.stringifyException(e));
+    }
+  }
+
+}