You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2008/12/02 15:50:37 UTC
svn commit: r722478 -
/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java
Author: kubes
Date: Tue Dec 2 06:50:37 2008
New Revision: 722478
URL: http://svn.apache.org/viewvc?rev=722478&view=rev
Log:
NUTCH-647: Resolve URLs tool
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java
Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java?rev=722478&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java Tue Dec 2 06:50:37 2008
@@ -0,0 +1,184 @@
+package org.apache.nutch.tools;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.net.InetAddress;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.URLUtil;
+
+/**
+ * A simple tool that will spin up multiple threads to resolve urls to ip
+ * addresses. This can be used to verify that pages that are failing due to
+ * UnknownHostException during fetching are actually bad and are not failing due
+ * to a dns problem in fetching.
+ */
+public class ResolveUrls {
+
+ public static final Log LOG = LogFactory.getLog(ResolveUrls.class);
+
+ private String urlsFile = null;
+ private int numThreads = 100;
+ private ExecutorService pool = null;
+ private static AtomicInteger numTotal = new AtomicInteger(0);
+ private static AtomicInteger numErrored = new AtomicInteger(0);
+ private static AtomicInteger numResolved = new AtomicInteger(0);
+ private static AtomicLong totalTime = new AtomicLong(0L);
+
+ /**
+ * A Thread which gets the ip address of a single host by name.
+ */
+ private static class ResolverThread
+ extends Thread {
+
+ private String url = null;
+
+ public ResolverThread(String url) {
+ this.url = url;
+ }
+
+ public void run() {
+
+ numTotal.incrementAndGet();
+ String host = URLUtil.getHost(url);
+ long start = System.currentTimeMillis();
+ try {
+
+ // get the address by name and if no error is thrown then it
+ // is resolved successfully
+ InetAddress ia = InetAddress.getByName(host);
+ LOG.info("Resolved: " + host);
+ numResolved.incrementAndGet();
+ }
+ catch (Exception uhe) {
+ LOG.info("Error Resolving: " + host);
+ numErrored.incrementAndGet();
+ }
+ long end = System.currentTimeMillis();
+ long total = (end - start);
+ totalTime.addAndGet(total);
+ LOG.info(", " + total + " millis");
+ }
+ }
+
+ /**
+ * Creates a thread pool for resolving urls. Reads in the url file on the
+ * local filesystem. For each url it attempts to resolve it keeping a total
+ * account of the number resolved, errored, and the amount of time.
+ */
+ public void resolveUrls() {
+
+ try {
+
+ // create a thread pool with a fixed number of threads
+ pool = Executors.newFixedThreadPool(numThreads);
+
+ // read in the urls file and loop through each line, one url per line
+ BufferedReader buffRead = new BufferedReader(new FileReader(new File(
+ urlsFile)));
+ String urlStr = null;
+ while ((urlStr = buffRead.readLine()) != null) {
+
+ // spin up a resolver thread per url
+ LOG.info("Starting: " + urlStr);
+ pool.execute(new ResolverThread(urlStr));
+ }
+
+ // close the file and wait for up to 60 seconds before shutting down
+ // the thread pool to give urls time to finish resolving
+ buffRead.close();
+ pool.awaitTermination(60, TimeUnit.SECONDS);
+ }
+ catch (Exception e) {
+
+ // on error shutdown the thread pool immediately
+ pool.shutdownNow();
+ LOG.info(StringUtils.stringifyException(e));
+ }
+
+ // shutdown the thread pool and log totals
+ pool.shutdown();
+ LOG.info("Total: " + numTotal.get() + ", Resovled: "
+ + numResolved.get() + ", Errored: " + numErrored.get()
+ + ", Average Time: " + totalTime.get() / numTotal.get());
+ }
+
+ /**
+ * Create a new ResolveUrls with a file from the local file system.
+ *
+ * @param urlsFile The local urls file, one url per line.
+ */
+ public ResolveUrls(String urlsFile) {
+ this(urlsFile, 100);
+ }
+
+ /**
+ * Create a new ResolveUrls with a urls file and a number of threads for the
+ * Thread pool. Number of threads is 100 by default.
+ *
+ * @param urlsFile The local urls file, one url per line.
+ * @param numThreads The number of threads used to resolve urls in parallel.
+ */
+ public ResolveUrls(String urlsFile, int numThreads) {
+ this.urlsFile = urlsFile;
+ this.numThreads = numThreads;
+ }
+
+ /**
+ * Runs the resolve urls tool.
+ */
+ public static void main(String[] args) {
+
+ Options options = new Options();
+ Option helpOpts = OptionBuilder.withArgName("help").withDescription(
+ "show this help message").create("help");
+ Option urlOpts = OptionBuilder.withArgName("urls").hasArg().withDescription(
+ "the urls file to check").create("urls");
+ Option numThreadOpts = OptionBuilder.withArgName("numThreads").hasArgs().withDescription(
+ "the number of threads to use").create("numThreads");
+ options.addOption(helpOpts);
+ options.addOption(urlOpts);
+ options.addOption(numThreadOpts);
+
+ CommandLineParser parser = new GnuParser();
+ try {
+
+ // parse out common line arguments
+ CommandLine line = parser.parse(options, args);
+ if (line.hasOption("help") || !line.hasOption("urls")) {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("ResolveUrls", options);
+ return;
+ }
+
+ // get the urls and the number of threads and start the resolver
+ String urls = line.getOptionValue("urls");
+ int numThreads = 100;
+ String numThreadsStr = line.getOptionValue("numThreads");
+ if (numThreadsStr != null) {
+ numThreads = Integer.parseInt(numThreadsStr);
+ }
+ ResolveUrls resolve = new ResolveUrls(urls, numThreads);
+ resolve.resolveUrls();
+ }
+ catch (Exception e) {
+ LOG.fatal("ResolveUrls: " + StringUtils.stringifyException(e));
+ }
+ }
+
+}