You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/24 10:54:30 UTC
svn commit: r747319 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/fetcher/
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/test/
src/test/org/apache/nutch/fetcher/
Author: siren
Date: Tue Feb 24 09:54:30 2009
New Revision: 747319
URL: http://svn.apache.org/viewvc?rev=747319&view=rev
Log:
NUTCH-247 - Robot parser to restrict, contributed by kubes
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
lucene/nutch/trunk/src/test/crawl-tests.xml
lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 09:54:30 2009
@@ -357,6 +357,8 @@
133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links
set at cross domain redirects (Remco Verhoef, dogacan via siren)
+134. NUTCH-247 - Robot parser to restrict (kubes, siren)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Feb 24 09:54:30 2009
@@ -933,6 +933,8 @@
public void fetch(Path segment, int threads, boolean parsing)
throws IOException {
+ checkConfiguration();
+
if (LOG.isInfoEnabled()) {
LOG.info("Fetcher: starting");
LOG.info("Fetcher: segment: " + segment);
@@ -995,4 +997,40 @@
fetcher.fetch(segment, threads, parsing); // run the Fetcher
}
+
+ /**
+  * Validates the agent-related configuration before a fetch is started.
+  *
+  * 'http.agent.name' is mandatory: when it is missing or blank the problem
+  * is logged at fatal level and the fetch is aborted. The agent name is
+  * also expected to be the first entry of the comma-separated
+  * 'http.robots.agents' list; when it is not (or when that list is missing
+  * or empty) only a warning is logged and the fetch proceeds.
+  *
+  * @throws IllegalArgumentException if 'http.agent.name' is unset or blank
+  */
+ private void checkConfiguration() {
+
+   // a fetch without an agent name is refused outright
+   String agentName = getConf().get("http.agent.name");
+   if (agentName == null || agentName.trim().length() == 0) {
+     String message = "Fetcher: No agents listed in 'http.agent.name'"
+       + " property.";
+     if (LOG.isFatalEnabled()) {
+       LOG.fatal(message);
+     }
+     throw new IllegalArgumentException(message);
+   }
+
+   // Only the first advertised agent matters for this check. Guard against
+   // an unset property (StringTokenizer rejects null) and against a list
+   // without any token (empty value, or nothing but commas).
+   String agentNames = getConf().get("http.robots.agents");
+   String firstAgent = null;
+   if (agentNames != null) {
+     StringTokenizer tok = new StringTokenizer(agentNames, ",");
+     if (tok.hasMoreTokens()) {
+       firstAgent = tok.nextToken().trim();
+     }
+   }
+
+   // a mismatch is not fatal: warn and carry on with the fetch
+   if (firstAgent == null || !firstAgent.equalsIgnoreCase(agentName)) {
+     String message = "Fetcher: Your 'http.agent.name' value should be "
+       + "listed first in 'http.robots.agents' property.";
+     if (LOG.isWarnEnabled()) {
+       LOG.warn(message);
+     }
+   }
+ }
+
}
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Feb 24 09:54:30 2009
@@ -223,9 +223,6 @@
// Grab the agent names we advertise to robots files.
//
String agentName = conf.get("http.agent.name");
- if (null == agentName) {
- throw new RuntimeException("Agent name not configured!");
- }
String agentNames = conf.get("http.robots.agents");
StringTokenizer tok = new StringTokenizer(agentNames, ",");
ArrayList agents = new ArrayList();
@@ -233,23 +230,6 @@
agents.add(tok.nextToken().trim());
}
- //
- // If there are no agents for robots-parsing, use our
- // default agent-string. If both are present, our agent-string
- // should be the first one we advertise to robots-parsing.
- //
- if (agents.size() == 0) {
- agents.add(agentName);
- if (LOG.isFatalEnabled()) {
- LOG.fatal("No agents listed in 'http.robots.agents' property!");
- }
- } else if (!((String)agents.get(0)).equalsIgnoreCase(agentName)) {
- agents.add(0, agentName);
- if (LOG.isFatalEnabled()) {
- LOG.fatal("Agent we advertise (" + agentName
- + ") not listed first in 'http.robots.agents' property!");
- }
- }
setRobotNames((String[]) agents.toArray(new String[agents.size()]));
}
Modified: lucene/nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/crawl-tests.xml (original)
+++ lucene/nutch/trunk/src/test/crawl-tests.xml Tue Feb 24 09:54:30 2009
@@ -33,5 +33,20 @@
<value>test-nutch,*</value>
</property>
+<property>
+ <name>http.agent.name.check</name>
+ <value>true</value>
+</property>
+
+<property>
+ <name>http.robots.agents</name>
+ <value>test-nutch,*</value>
+ <description>The agent strings we'll look for in robots.txt files,
+ comma-separated, in decreasing order of precedence. You should
+ put the value of http.agent.name as the first agent name, and keep the
+ default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+ </description>
+</property>
+
</configuration>
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Tue Feb 24 09:54:30 2009
@@ -167,5 +167,24 @@
private void addUrl(ArrayList<String> urls, String page) {
urls.add("http://127.0.0.1:" + server.getListeners()[0].getPort() + "/" + page);
}
+
+ /**
+  * Verifies that a fetch is rejected with an IllegalArgumentException
+  * carrying the expected message when 'http.agent.name' is blank.
+  */
+ public void testAgentNameCheck() {
+
+   final String expectedMessage = "Fetcher: No agents listed in "
+     + "'http.agent.name' property.";
+   boolean failedNoAgentName = false;
+   // remembered so that an unexpected failure shows up in the assertion
+   // message instead of being silently swallowed
+   Exception unexpected = null;
+   conf.set("http.agent.name", "");
+
+   try {
+     conf.setBoolean("fetcher.parse", true);
+     Fetcher2 fetcher = new Fetcher2(conf);
+     fetcher.fetch(null, 1, false);
+   } catch (IllegalArgumentException iae) {
+     // constant-first comparison: getMessage() may be null
+     failedNoAgentName = expectedMessage.equals(iae.getMessage());
+     if (!failedNoAgentName) {
+       unexpected = iae;
+     }
+   } catch (Exception e) {
+     unexpected = e;
+   }
+
+   assertTrue("Expected IllegalArgumentException with message \""
+     + expectedMessage + "\" but got: " + unexpected, failedNoAgentName);
+ }
}