Posted to commits@nutch.apache.org by sn...@apache.org on 2015/04/17 22:49:20 UTC
svn commit: r1674399 - in /nutch/trunk: ./ conf/
src/java/org/apache/nutch/protocol/
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/
src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/
Author: snagel
Date: Fri Apr 17 20:49:19 2015
New Revision: 1674399
URL: http://svn.apache.org/r1674399
Log:
NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing
Removed:
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/log4j.properties
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1674399&r1=1674398&r2=1674399&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Apr 17 20:49:19 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-1927 Create a whitelist of IPs/hostnames to allow skipping of RobotRules parsing (mattmann, snagel)
+
* NUTCH-1986 Clarify Elastic Search Indexer Plugin Settings (Michael Joyce via mattmann)
* NUTCH-1906 Typo in CrawlDbReader command line help (Michael Joyce via mattmann)
Modified: nutch/trunk/conf/log4j.properties
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1674399&r1=1674398&r2=1674399&view=diff
==============================================================================
--- nutch/trunk/conf/log4j.properties (original)
+++ nutch/trunk/conf/log4j.properties Fri Apr 17 20:49:19 2015
@@ -54,6 +54,7 @@ log4j.logger.org.apache.nutch.indexer.In
log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout
log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout
log4j.logger.org.apache.nutch.tools.CrawlDBScanner=INFO,cmdstdout
+log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout
log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN
log4j.logger.org.apache.nutch=INFO
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1674399&r1=1674398&r2=1674399&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Apr 17 20:49:19 2015
@@ -118,6 +118,15 @@
</property>
<property>
+ <name>http.robot.rules.whitelist</name>
+ <value></value>
+ <description>Comma-separated list of hostnames or IP addresses for which
+ robots.txt parsing is skipped. Use with care and only if you are explicitly
+ allowed by the site owner to ignore the site's robots.txt!
+ </description>
+</property>
+
+<property>
<name>http.robots.403.allow</name>
<value>true</value>
<description>Some servers return HTTP status 403 (Forbidden) if
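
For illustration, enabling the new whitelist in a deployment's conf/nutch-site.xml
could look like the snippet below; the hostname and IP address are placeholders,
not values taken from this commit:

  <property>
    <name>http.robot.rules.whitelist</name>
    <value>intranet.example.com,192.0.2.10</value>
    <description>Hosts we operate ourselves; robots.txt is skipped here by
    explicit agreement with the site owner.</description>
  </property>
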
Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1674399&r1=1674398&r2=1674399&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Apr 17 20:49:19 2015
@@ -20,10 +20,15 @@ package org.apache.nutch.protocol;
// JDK imports
import java.io.File;
import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.LineNumberReader;
+import java.net.MalformedURLException;
import java.net.URL;
-import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
import java.util.Hashtable;
+import java.util.Set;
import java.util.StringTokenizer;
// Commons Logging imports
@@ -32,10 +37,11 @@ import org.slf4j.LoggerFactory;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
-
-import com.google.common.io.Files;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
@@ -46,8 +52,11 @@ import crawlercommons.robots.SimpleRobot
* This class uses crawler-commons for handling the parsing of
* {@code robots.txt} files. It emits SimpleRobotRules objects, which describe
* the download permissions as described in SimpleRobotRulesParser.
+ *
+ * Protocol-specific implementations have to implement the method
+ * {@link #getRobotRulesSet(Protocol, URL)}.
*/
-public abstract class RobotRulesParser implements Configurable {
+public abstract class RobotRulesParser implements Tool {
public static final Logger LOG = LoggerFactory
.getLogger(RobotRulesParser.class);
@@ -70,9 +79,13 @@ public abstract class RobotRulesParser i
RobotRulesMode.ALLOW_NONE);
private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
- private Configuration conf;
+ protected Configuration conf;
protected String agentNames;
+ /** set of host names or IPs to be explicitly excluded from robots.txt checking */
+ protected Set<String> whiteList = new HashSet<String>();
+
+
public RobotRulesParser() {
}
@@ -112,6 +125,12 @@ public abstract class RobotRulesParser i
agentNames = sb.toString();
}
+
+ String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
+ if (confWhiteList != null && confWhiteList.length > 0) {
+ whiteList.addAll(Arrays.asList(confWhiteList));
+ LOG.info("Whitelisted hosts: " + whiteList);
+ }
}
/**
@@ -121,6 +140,14 @@ public abstract class RobotRulesParser i
return conf;
}
+
+ /**
+ * Check whether a URL belongs to a whitelisted host.
+ */
+ public boolean isWhiteListed(URL url) {
+ return whiteList.contains(url.getHost());
+ }
+
/**
* Parses the robots content using the {@link SimpleRobotRulesParser} from
* crawler commons
@@ -151,41 +178,127 @@ public abstract class RobotRulesParser i
return getRobotRulesSet(protocol, u);
}
+ /**
+ * Fetch robots.txt (or its protocol-specific equivalent) which applies to
+ * the given URL, parse it and return the set of robot rules applicable for
+ * the configured agent name(s).
+ *
+ * @param protocol
+ * protocol implementation
+ * @param url
+ * URL to be checked whether fetching is allowed by robot rules
+ * @return robot rules
+ */
public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);
- /** command-line main for testing */
- public static void main(String[] argv) {
+ @Override
+ public int run(String[] args) {
- if (argv.length != 3) {
- System.err
- .println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
- System.err
- .println("\tThe <robots-file> will be parsed as a robots.txt file,");
- System.err
- .println("\tusing the given <agent-name> to select rules. URLs ");
- System.err
- .println("\twill be read (one per line) from <url-file>, and tested");
- System.err
- .println("\tagainst the rules. Multiple agent names can be provided using");
- System.err.println("\tcomma as a delimiter without any spaces.");
+ if (args.length < 2) {
+ String[] help = {
+ "Usage: RobotRulesParser <robots-file> <url-file> [<agent-names>]\n",
+ "\tThe <robots-file> will be parsed as a robots.txt file,",
+ "\tusing the given <agent-name> to select rules.",
+ "\tURLs will be read (one per line) from <url-file>,",
+ "\tand tested against the rules.",
+ "\tMultiple agent names can be provided using",
+ "\tcomma as a delimiter without any spaces.",
+ "\tIf no agent name is given the property http.agent.name",
+ "\tis used. If http.agent.name is empty, robots.txt is checked",
+ "\tfor rules assigned to the user agent `*' (meaning any other)." };
+ for (String s : help) {
+ System.err.println(s);
+ }
System.exit(-1);
}
+ File robotsFile = new File(args[0]);
+ File urlFile = new File(args[1]);
+
+ if (args.length > 2) {
+ // set agent name from command-line in configuration and update parser
+ String agents = args[2];
+ conf.set("http.agent.name", agents);
+ setConf(conf);
+ }
+
try {
- byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
- BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes,
- "text/plain", argv[2]);
+ BaseRobotRules rules = getRobotRulesSet(null, robotsFile.toURI().toURL());
- LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
- String testPath = testsIn.readLine().trim();
+ LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
+ String testPath;
+ testPath = testsIn.readLine().trim();
while (testPath != null) {
- System.out.println((rules.isAllowed(testPath) ? "allowed"
- : "not allowed") + ":\t" + testPath);
+ try {
+ // testPath can be just a path or a complete URL
+ URL url = new URL(testPath);
+ String status;
+ if (isWhiteListed(url)) {
+ status = "whitelisted";
+ } else if (rules.isAllowed(testPath)) {
+ status = "allowed";
+ } else {
+ status = "not allowed";
+ }
+ System.out.println(status + ":\t" + testPath);
+ } catch (MalformedURLException e) {
+ // testPath is not an absolute URL: no host to check, skip it silently
+ }
testPath = testsIn.readLine();
}
testsIn.close();
- } catch (Exception e) {
- e.printStackTrace();
+ } catch (IOException e) {
+ LOG.error("Failed to run: " + StringUtils.stringifyException(e));
+ return -1;
}
+
+ return 0;
}
+
+ /**
+ * {@link RobotRulesParser} implementation which expects the location of the
+ * robots.txt passed by URL (usually pointing to a local file) in
+ * {@link #getRobotRulesSet(Protocol, URL)}.
+ */
+ private static class TestRobotRulesParser extends RobotRulesParser {
+
+ public TestRobotRulesParser(Configuration conf) {
+ // make sure that agent name is set so that setConf() does not complain,
+ // the agent name is later overwritten by command-line argument
+ if (conf.get("http.agent.name") == null) {
+ conf.set("http.agent.name", "*");
+ }
+ setConf(conf);
+ }
+
+ /**
+ * @param protocol (ignored)
+ * @param url
+ * location of the robots.txt file
+ * */
+ public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url) {
+ BaseRobotRules rules;
+ try {
+ int contentLength = url.openConnection().getContentLength();
+ byte[] robotsBytes = new byte[contentLength];
+ InputStream openStream = url.openStream();
+ openStream.read(robotsBytes);
+ openStream.close();
+ rules = robotParser.parseContent(url.toString(), robotsBytes,
+ "text/plain", this.conf.get("http.agent.name"));
+ } catch (IOException e) {
+ LOG.error("Failed to open robots.txt file " + url
+ + StringUtils.stringifyException(e));
+ rules = EMPTY_RULES;
+ }
+ return rules;
+ }
+
+ }
+
+ public static void main(String[] args) throws Exception {
+ Configuration conf = NutchConfiguration.create();
+ int res = ToolRunner.run(conf, new TestRobotRulesParser(conf), args);
+ System.exit(res);
+ }
+
}
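
As the rewritten parser shows, isWhiteListed() is a plain equality lookup of
url.getHost() in the configured set: it neither matches subdomains nor resolves
hostnames to IP addresses, so every host that should bypass robots.txt must be
listed explicitly. A minimal, standalone Java sketch of this behavior (host names
are illustrative only, not taken from this commit):

  import java.net.URL;
  import java.util.Arrays;
  import java.util.HashSet;
  import java.util.Set;

  public class WhitelistCheckDemo {
    public static void main(String[] args) throws Exception {
      // same data structure as RobotRulesParser.whiteList
      Set<String> whiteList = new HashSet<String>(
          Arrays.asList("intranet.example.com", "192.0.2.10"));

      // exact host match: the www. subdomain below is NOT covered
      for (String u : new String[] { "http://intranet.example.com/page",
                                     "http://www.intranet.example.com/page" }) {
        URL url = new URL(u);
        System.out.println(url.getHost() + " whitelisted? "
            + whiteList.contains(url.getHost()));
      }
    }
  }

Since main() now goes through ToolRunner, the tester can also be started via the
generic Nutch launcher (bin/nutch CLASSNAME), e.g.
bin/nutch org.apache.nutch.protocol.RobotRulesParser <robots-file> <url-file> [<agent-names>]
with paths and agent names of the user's choosing.
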
Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1674399&r1=1674398&r2=1674399&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Fri Apr 17 20:49:19 2015
@@ -29,7 +29,6 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.RobotRulesParser;
import crawlercommons.robots.BaseRobotRules;
-import crawlercommons.robots.SimpleRobotRules;
/**
* This class is used for parsing robots for urls belonging to HTTP protocol. It
@@ -87,16 +86,31 @@ public class HttpRobotRulesParser extend
*/
public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
+ if (LOG.isTraceEnabled() && isWhiteListed(url)) {
+ LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+ }
+
String cacheKey = getCacheKey(url);
- BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey);
+ BaseRobotRules robotRules = CACHE.get(cacheKey);
+
+ if (robotRules != null) {
+ return robotRules; // cached rule
+ } else if (LOG.isTraceEnabled()) {
+ LOG.trace("cache miss " + url);
+ }
boolean cacheRule = true;
+ URL redir = null;
- if (robotRules == null) { // cache miss
- URL redir = null;
- if (LOG.isTraceEnabled()) {
- LOG.trace("cache miss " + url);
- }
+ if (isWhiteListed(url)) {
+ // check in advance whether a host is whitelisted
+ // (we do not need to fetch robots.txt)
+ robotRules = EMPTY_RULES;
+ LOG.info("Whitelisted host found for: {}", url);
+ LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
+ url.getHost());
+
+ } else {
try {
Response response = ((HttpBase) http).getResponse(new URL(url,
"/robots.txt"), new CrawlDatum(), true);
@@ -127,7 +141,7 @@ public class HttpRobotRulesParser extend
else if ((response.getCode() == 403) && (!allowForbidden))
robotRules = FORBID_ALL_RULES; // use forbid all
else if (response.getCode() >= 500) {
- cacheRule = false;
+ cacheRule = false; // try again later to fetch robots.txt
robotRules = EMPTY_RULES;
} else
robotRules = EMPTY_RULES; // use default rules
@@ -135,18 +149,19 @@ public class HttpRobotRulesParser extend
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
}
- cacheRule = false;
+ cacheRule = false; // try again later to fetch robots.txt
robotRules = EMPTY_RULES;
}
+ }
- if (cacheRule) {
- CACHE.put(cacheKey, robotRules); // cache rules for host
- if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
- // cache also for the redirected host
- CACHE.put(getCacheKey(redir), robotRules);
- }
+ if (cacheRule) {
+ CACHE.put(cacheKey, robotRules); // cache rules for host
+ if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
+ // cache also for the redirected host
+ CACHE.put(getCacheKey(redir), robotRules);
}
}
+
return robotRules;
}
}
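
Taken together, the restructured getRobotRulesSet() now decides in a fixed order:
return cached rules if present, otherwise short-circuit whitelisted hosts to
EMPTY_RULES, otherwise fetch and parse /robots.txt; the result is then cached per
host (unless a 5xx response or an exception set cacheRule to false). The following
self-contained toy model illustrates that order only; the names, types and the
stubbed fetch are hypothetical, not the actual Nutch code:

  import java.net.URL;
  import java.util.HashMap;
  import java.util.HashSet;
  import java.util.Map;
  import java.util.Set;

  public class RobotRulesFlowDemo {
    static final String EMPTY_RULES = "allow-all";     // stands in for BaseRobotRules
    static final Map<String, String> CACHE = new HashMap<String, String>();
    static final Set<String> WHITELIST = new HashSet<String>();

    // roughly mirrors the "protocol:host[:port]" cache key used by the parser
    static String getCacheKey(URL url) {
      return url.getProtocol() + ":" + url.getHost()
          + (url.getPort() == -1 ? "" : ":" + url.getPort());
    }

    static String rulesFor(URL url) {
      String key = getCacheKey(url);
      String rules = CACHE.get(key);
      if (rules != null) return rules;                 // 1. cached rules win
      if (WHITELIST.contains(url.getHost())) {
        rules = EMPTY_RULES;                           // 2. whitelisted: no robots.txt fetch
      } else {
        rules = "parsed-from-robots.txt";              // 3. fetch and parse (stubbed here)
      }
      CACHE.put(key, rules);                           // cached per host either way
                                                       // (the real code skips this when cacheRule is false)
      return rules;
    }

    public static void main(String[] args) throws Exception {
      WHITELIST.add("intranet.example.com");
      System.out.println(rulesFor(new URL("http://intranet.example.com/a")));
      System.out.println(rulesFor(new URL("http://www.example.org/b")));
    }
  }
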
Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1674399&r1=1674398&r2=1674399&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java (original)
+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java Fri Apr 17 20:49:19 2015
@@ -69,15 +69,28 @@ public class FtpRobotRulesParser extends
// case
String host = url.getHost().toLowerCase(); // normalize to lower case
- BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":"
- + host);
+ if (LOG.isTraceEnabled() && isWhiteListed(url)) {
+ LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+ }
+
+ BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
+
+ if (robotRules != null) {
+ return robotRules; // cached rule
+ } else if (LOG.isTraceEnabled()) {
+ LOG.trace("cache miss " + url);
+ }
boolean cacheRule = true;
- if (robotRules == null) { // cache miss
- if (LOG.isTraceEnabled())
- LOG.trace("cache miss " + url);
+ if (isWhiteListed(url)) {
+ // check in advance whether a host is whitelisted
+ // (we do not need to fetch robots.txt)
+ robotRules = EMPTY_RULES;
+ LOG.info("Whitelisted host found for: {}", url);
+ LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
+ } else {
try {
Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
@@ -94,13 +107,15 @@ public class FtpRobotRulesParser extends
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
}
- cacheRule = false;
+ cacheRule = false; // try again later to fetch robots.txt
robotRules = EMPTY_RULES;
}
- if (cacheRule)
- CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
}
+
+ if (cacheRule)
+ CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
+
return robotRules;
}
}