You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ku...@apache.org on 2008/02/12 15:51:37 UTC
svn commit: r620817 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/Generator.java
Author: kubes
Date: Tue Feb 12 06:51:33 2008
New Revision: 620817
URL: http://svn.apache.org/viewvc?rev=620817&view=rev
Log:
NUTCH-606 - Refactoring of Generator, run all urls through checks.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=620817&r1=620816&r2=620817&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 12 06:51:33 2008
@@ -204,6 +204,8 @@
72. NUTCH-608 - Upgrade nutch to use released apache-tika-0.1-incubating (mattmann)
+73. NUTCH-606 - Refactoring of Generator, run all urls through checks (kubes)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=620817&r1=620816&r2=620817&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Tue Feb 12 06:51:33 2008
@@ -181,56 +181,63 @@
while (values.hasNext() && count < limit) {
SelectorEntry entry = (SelectorEntry)values.next();
- Text url = entry.url;
-
- if (maxPerHost > 0) { // are we counting hosts?
- URL u = null;
+ Text url = entry.url;
+ String urlString = url.toString();
+ URL u = null;
+
+ // skip bad urls, including empty and null urls
+ try {
+ u = new URL(url.toString());
+ } catch (MalformedURLException e) {
+ LOG.info("Bad protocol in url: " + url.toString());
+ continue;
+ }
+
+ String host = u.getHost();
+ host = host.toLowerCase();
+
+ // partitioning by ip will generate lots of DNS requests here, and will
+ // be up to double the overall dns load, do not run this way unless you
+ // are running a local caching DNS server or a two layer DNS cache
+ if (byIP) {
try {
- u = new URL(url.toString());
- } catch (MalformedURLException e) {
- LOG.info("Bad protocol in url: " + url.toString());
- continue;
- }
- String host = u.getHost();
- if (host == null) {
- // unknown host, skip
- continue;
- }
- host = host.toLowerCase();
- if (byIP) {
- try {
- InetAddress ia = InetAddress.getByName(host);
- host = ia.getHostAddress();
- } catch (UnknownHostException uhe) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("DNS lookup failed: " + host + ", skipping.");
- }
- dnsFailure++;
- if ((dnsFailure % 1000 == 0) && (LOG.isWarnEnabled())) {
- LOG.warn("DNS failures: " + dnsFailure);
- }
- continue;
+ InetAddress ia = InetAddress.getByName(host);
+ host = ia.getHostAddress();
+ urlString = new URL(u.getProtocol(), host, u.getPort(), u.getFile()).toString();
+ }
+ catch (UnknownHostException uhe) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("DNS lookup failed: " + host + ", skipping.");
+ }
+ dnsFailure++;
+ if ((dnsFailure % 1000 == 0) && (LOG.isWarnEnabled())) {
+ LOG.warn("DNS failures: " + dnsFailure);
}
- }
- u = new URL(u.getProtocol(), host, u.getPort(), u.getFile());
- String urlString = u.toString();
- try {
- urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
- host = new URL(urlString).getHost();
- } catch (Exception e) {
- LOG.warn("Malformed URL: '" + urlString + "', skipping (" +
- StringUtils.stringifyException(e) + ")");
continue;
}
+ }
+
+ try {
+ urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
+ host = new URL(urlString).getHost();
+ } catch (Exception e) {
+ LOG.warn("Malformed URL: '" + urlString + "', skipping (" +
+ StringUtils.stringifyException(e) + ")");
+ continue;
+ }
+
+ // only filter if we are counting hosts
+ if (maxPerHost > 0) {
+
IntWritable hostCount = hostCounts.get(host);
if (hostCount == null) {
hostCount = new IntWritable();
hostCounts.put(host, hostCount);
}
-
+
// increment hostCount
hostCount.set(hostCount.get() + 1);
-
+
// skip URL if above the limit per host.
if (hostCount.get() > maxPerHost) {
if (hostCount.get() == maxPerHost + 1) {
@@ -249,9 +256,7 @@
// maxPerHost may cause us to skip it.
count++;
}
-
}
-
}
public static class DecreasingFloatComparator extends FloatWritable.Comparator {
@@ -450,13 +455,15 @@
}
}
}
+
+ for (int i = 0; i < readers.length; i++) readers[i].close();
+
if (empty) {
LOG.warn("Generator: 0 records selected for fetching, exiting ...");
LockUtil.removeLockFile(fs, lock);
fs.delete(tempDir);
return null;
}
- for (int i = 0; i < readers.length; i++) readers[i].close();
// invert again, paritition by host, sort by url hash
if (LOG.isInfoEnabled()) {