You are viewing a plain text version of this content; the canonical hyperlink was removed during plain-text conversion.
Posted to commits@nutch.apache.org by ot...@apache.org on 2009/01/13 23:16:01 UTC
svn commit: r734257 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/Generator.java
Author: otis
Date: Tue Jan 13 14:15:58 2009
New Revision: 734257
URL: http://svn.apache.org/viewvc?rev=734257&view=rev
Log:
NUTCH-627 - Minimize host address lookup while running generate
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=734257&r1=734256&r2=734257&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Jan 13 14:15:58 2009
@@ -308,6 +308,8 @@
115. NUTCH-652 - AdaptiveFetchSchedule#setFetchSchedule doesn't calculate
fetch interval correctly. (dogacan)
+
+116. NUTCH-627 - Minimize host address lookup (Otis Gospodnetic)
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=734257&r1=734256&r2=734257&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Tue Jan 13 14:15:58 2009
@@ -88,6 +88,8 @@
private HashMap<String, IntWritable> hostCounts =
new HashMap<String, IntWritable>();
private int maxPerHost;
+ private HashSet<String> maxedHosts = new HashSet<String>();
+ private HashSet<String> dnsFailureHosts = new HashSet<String>();
private Partitioner<Text, Writable> hostPartitioner = new PartitionUrlByHost();
private URLFilters filters;
private URLNormalizers normalizers;
@@ -195,17 +197,28 @@
String host = u.getHost();
host = host.toLowerCase();
-
+ String hostname = host;
+
// partitioning by ip will generate lots of DNS requests here, and will
// be up to double the overall dns load, do not run this way unless you
// are running a local caching DNS server or a two layer DNS cache
if (byIP) {
+ if (maxedHosts.contains(host)) {
+ if (LOG.isDebugEnabled()) { LOG.debug("Host already maxed out: " + host); }
+ continue;
+ }
+ if (dnsFailureHosts.contains(host)) {
+ if (LOG.isDebugEnabled()) { LOG.debug("Host name lookup already failed: " + host); }
+ continue;
+ }
try {
InetAddress ia = InetAddress.getByName(host);
host = ia.getHostAddress();
urlString = new URL(u.getProtocol(), host, u.getPort(), u.getFile()).toString();
}
catch (UnknownHostException uhe) {
+ // remember hostnames that could not be looked up
+ dnsFailureHosts.add(hostname);
if (LOG.isDebugEnabled()) {
LOG.debug("DNS lookup failed: " + host + ", skipping.");
}
@@ -241,6 +254,8 @@
// skip URL if above the limit per host.
if (hostCount.get() > maxPerHost) {
if (hostCount.get() == maxPerHost + 1) {
+ // remember the raw hostname that is maxed out
+ maxedHosts.add(hostname);
if (LOG.isInfoEnabled()) {
LOG.info("Host " + host + " has more than " + maxPerHost +
" URLs." + " Skipping additional.");