You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2011/09/01 15:08:24 UTC
svn commit: r1164064 - in /nutch/branches/branch-1.4: CHANGES.txt
conf/nutch-default.xml src/java/org/apache/nutch/fetcher/Fetcher.java
Author: jnioche
Date: Thu Sep 1 13:08:23 2011
New Revision: 1164064
URL: http://svn.apache.org/viewvc?rev=1164064&view=rev
Log:
NUTCH-1073 Renamed parameters 'fetcher.threads.per.host.by.ip' and 'fetcher.threads.per.host'
Modified:
nutch/branches/branch-1.4/CHANGES.txt
nutch/branches/branch-1.4/conf/nutch-default.xml
nutch/branches/branch-1.4/src/java/org/apache/nutch/fetcher/Fetcher.java
Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1164064&r1=1164063&r2=1164064&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Thu Sep 1 13:08:23 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 1.4 - Current development
+* NUTCH-1073 Rename parameters 'fetcher.threads.per.host.by.ip' and 'fetcher.threads.per.host' (jnioche)
+
* NUTCH-1089 Short compressed pages caused exception in protocol-httpclient (Simone Frenzel via jnioche)
* NUTCH-1085 Nutch script does not require HADOOP_HOME (jnioche)
Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1164064&r1=1164063&r2=1164064&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Thu Sep 1 13:08:23 2011
@@ -632,20 +632,21 @@
</property>
<property>
- <name>fetcher.threads.per.host</name>
+ <name>fetcher.threads.per.queue</name>
<value>1</value>
<description>This number is the maximum number of threads that
- should be allowed to access a host at one time.</description>
+ should be allowed to access a queue at one time. Replaces the
+ deprecated parameter 'fetcher.threads.per.host'.
+ </description>
</property>
<property>
- <name>fetcher.threads.per.host.by.ip</name>
- <value>false</value>
- <description>If true, then fetcher will count threads by IP address,
- to which the URL's host name resolves. If false, only host name will be
- used. NOTE: this should be set to the same value as
- "generate.max.per.host.by.ip" - default settings are different only for
- reasons of backward-compatibility.</description>
+ <name>fetcher.queue.mode</name>
+ <value>byHost</value>
+ <description>Determines how to put URLs into queues. The default value is 'byHost';
+ 'byDomain' and 'byIP' are also accepted. Replaces the deprecated parameter
+ 'fetcher.threads.per.host.by.ip'.
+ </description>
</property>
<property>
Modified: nutch/branches/branch-1.4/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1164064&r1=1164063&r2=1164064&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/branches/branch-1.4/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Sep 1 13:08:23 2011
@@ -148,11 +148,11 @@ public class Fetcher extends Configured
this.queueID = queueID;
}
- /** Create an item. Queue id will be created based on <code>byIP</code>
- * argument, either as a protocol + hostname pair, or protocol + IP
- * address pair.
+ /** Create an item. Queue id will be created based on <code>queueMode</code>
+ * argument, as one of: a protocol + hostname pair, a protocol + IP
+ * address pair, or a protocol + domain pair.
*/
- public static FetchItem create(Text url, CrawlDatum datum, boolean byIP) {
+ public static FetchItem create(Text url, CrawlDatum datum, String queueMode) {
String queueID;
URL u = null;
try {
@@ -161,26 +161,33 @@ public class Fetcher extends Configured
LOG.warn("Cannot parse url: " + url, e);
return null;
}
- String proto = u.getProtocol().toLowerCase();
- String host;
- if (byIP) {
+ final String proto = u.getProtocol().toLowerCase();
+ String key;
+ if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) {
try {
- InetAddress addr = InetAddress.getByName(u.getHost());
- host = addr.getHostAddress();
- } catch (UnknownHostException e) {
+ final InetAddress addr = InetAddress.getByName(u.getHost());
+ key = addr.getHostAddress();
+ } catch (final UnknownHostException e) {
// unable to resolve it, so don't fall back to host name
LOG.warn("Unable to resolve: " + u.getHost() + ", skipping.");
return null;
}
- } else {
- host = u.getHost();
- if (host == null) {
- LOG.warn("Unknown host for url: " + url + ", skipping.");
- return null;
+ }
+ else if (FetchItemQueues.QUEUE_MODE_DOMAIN.equalsIgnoreCase(queueMode)){
+ key = URLUtil.getDomainName(u);
+ if (key == null) {
+ LOG.warn("Unknown domain for url: " + url + ", using URL string as key");
+ key=u.toExternalForm();
+ }
+ }
+ else {
+ key = u.getHost();
+ if (key == null) {
+ LOG.warn("Unknown host for url: " + url + ", using URL string as key");
+ key=u.toExternalForm();
}
- host = host.toLowerCase();
}
- queueID = proto + "://" + host;
+ queueID = proto + "://" + key.toLowerCase();
return new FetchItem(url, u, datum, queueID);
}
@@ -309,18 +316,30 @@ public class Fetcher extends Configured
Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
AtomicInteger totalSize = new AtomicInteger(0);
int maxThreads;
- boolean byIP;
long crawlDelay;
long minCrawlDelay;
long timelimit = -1;
int maxExceptionsPerQueue = -1;
- Configuration conf;
+ Configuration conf;
+
+ public static final String QUEUE_MODE_HOST = "byHost";
+ public static final String QUEUE_MODE_DOMAIN = "byDomain";
+ public static final String QUEUE_MODE_IP = "byIP";
+
+ String queueMode;
public FetchItemQueues(Configuration conf) {
this.conf = conf;
- this.maxThreads = conf.getInt("fetcher.threads.per.host", 1);
- // backward-compatible default setting
- this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", false);
+ this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
+ queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
+ // check that the mode is known
+ if (!queueMode.equals(QUEUE_MODE_IP) && !queueMode.equals(QUEUE_MODE_DOMAIN)
+ && !queueMode.equals(QUEUE_MODE_HOST)) {
+ LOG.error("Unknown partition mode : " + queueMode + " - forcing to byHost");
+ queueMode = QUEUE_MODE_HOST;
+ }
+ LOG.info("Using queue mode : "+queueMode);
+
this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000);
this.timelimit = conf.getLong("fetcher.timelimit", -1);
@@ -336,7 +355,7 @@ public class Fetcher extends Configured
}
public void addFetchItem(Text url, CrawlDatum datum) {
- FetchItem it = FetchItem.create(url, datum, byIP);
+ FetchItem it = FetchItem.create(url, datum, queueMode);
if (it != null) addFetchItem(it);
}
@@ -535,7 +554,7 @@ public class Fetcher extends Configured
private URLNormalizers normalizers;
private ProtocolFactory protocolFactory;
private long maxCrawlDelay;
- private boolean byIP;
+ private String queueMode;
private int maxRedirect;
private String reprUrl;
private boolean redirecting;
@@ -552,7 +571,14 @@ public class Fetcher extends Configured
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
- this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", false);
+ queueMode = conf.get("fetcher.queue.mode", FetchItemQueues.QUEUE_MODE_HOST);
+ // check that the mode is known
+ if (!queueMode.equals(FetchItemQueues.QUEUE_MODE_IP) && !queueMode.equals(FetchItemQueues.QUEUE_MODE_DOMAIN)
+ && !queueMode.equals(FetchItemQueues.QUEUE_MODE_HOST)) {
+ LOG.error("Unknown partition mode : " + queueMode + " - forcing to byHost");
+ queueMode = FetchItemQueues.QUEUE_MODE_HOST;
+ }
+ LOG.info("Using queue mode : "+queueMode);
this.maxRedirect = conf.getInt("http.redirect.max", 3);
this.ignoreExternalLinks =
conf.getBoolean("db.ignore.external.links", false);
@@ -665,7 +691,7 @@ public class Fetcher extends Configured
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
new Text(reprUrl));
}
- fit = FetchItem.create(redirUrl, newDatum, byIP);
+ fit = FetchItem.create(redirUrl, newDatum, queueMode);
if (fit != null) {
FetchItemQueue fiq =
fetchQueues.getFetchItemQueue(fit.queueID);
@@ -706,7 +732,7 @@ public class Fetcher extends Configured
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
new Text(reprUrl));
}
- fit = FetchItem.create(redirUrl, newDatum, byIP);
+ fit = FetchItem.create(redirUrl, newDatum, queueMode);
if (fit != null) {
FetchItemQueue fiq =
fetchQueues.getFetchItemQueue(fit.queueID);